pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
  4. datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +525 -61
  10. datamax/parser/docx_parser.py +512 -62
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -208
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. pydatamax-0.1.15.dist-info/METADATA +340 -0
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.13.dist-info/METADATA +0 -280
  38. pydatamax-0.1.13.dist-info/RECORD +0 -39
  39. tests/__init__.py +0 -0
  40. tests/test_basic.py +0 -20
  41. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
@@ -1,41 +1,83 @@
1
1
  import os
2
2
  import shutil
3
- import chardet
4
3
  import subprocess
5
4
  import tempfile
6
5
  from pathlib import Path
7
6
  from typing import Union
8
- from datamax.parser.base import BaseLife
9
- from datamax.parser.base import MarkdownOutputVo
7
+
8
+ import chardet
9
+
10
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
10
11
  from datamax.utils.ppt_extract import PPtExtractor
11
12
 
13
+ # 尝试导入UNO处理器
14
+ try:
15
+ from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
16
+ except ImportError:
17
+ HAS_UNO = False
18
+
12
19
 
13
20
  class PPtParser(BaseLife):
14
- def __init__(self, file_path: Union[str, list]):
21
+ def __init__(self, file_path: Union[str, list], use_uno: bool = None):
15
22
  super().__init__()
16
23
  self.file_path = file_path
17
24
 
25
+ # 自动检测是否使用UNO(如果未指定)
26
+ if use_uno is None:
27
+ self.use_uno = HAS_UNO
28
+ else:
29
+ self.use_uno = use_uno and HAS_UNO
30
+
18
31
  def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str:
32
+ if self.use_uno:
33
+ # 使用UNO API进行转换
34
+ try:
35
+ pptx_path = convert_with_uno(ppt_path, "pptx", dir_path)
36
+
37
+ if not os.path.exists(pptx_path):
38
+ raise Exception(
39
+ f"> !!! File conversion failed {ppt_path} ==> {pptx_path}"
40
+ )
41
+ else:
42
+ return pptx_path
43
+
44
+ except Exception as e:
45
+ if (
46
+ hasattr(self, "_fallback_to_subprocess")
47
+ and self._fallback_to_subprocess
48
+ ):
49
+ return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
50
+ raise
51
+ else:
52
+ # 使用传统的subprocess方式
53
+ return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
54
+
55
+ def _ppt_to_pptx_subprocess(self, ppt_path: str, dir_path: str) -> str:
56
+ """使用subprocess将.ppt文件转换为.pptx文件(传统方式)"""
19
57
  cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"'
20
- process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
58
+ process = subprocess.Popen(
59
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
60
+ )
21
61
  stdout, stderr = process.communicate()
22
62
  exit_code = process.returncode
23
63
  if exit_code == 0:
24
64
  pass
25
65
  else:
26
- encoding = chardet.detect(stderr)['encoding']
66
+ encoding = chardet.detect(stderr)["encoding"]
27
67
  if encoding is None:
28
- encoding = 'utf-8'
29
- raise Exception(f"Error Output (detected encoding: {encoding}):", stderr.decode(encoding, errors='replace'))
68
+ encoding = "utf-8"
69
+ raise Exception(
70
+ f"Error Output (detected encoding: {encoding}):",
71
+ stderr.decode(encoding, errors="replace"),
72
+ )
30
73
  fname = str(Path(ppt_path).stem)
31
- pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx')
74
+ pptx_path = os.path.join(os.path.dirname(ppt_path), f"{fname}.pptx")
32
75
  if not os.path.exists(pptx_path):
33
76
  raise Exception(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}")
34
77
  else:
35
78
  return pptx_path
36
79
 
37
80
  def read_ppt_file(self, file_path: str):
38
-
39
81
  try:
40
82
  with tempfile.TemporaryDirectory() as temp_path:
41
83
  temp_dir = Path(temp_path).resolve()
@@ -43,17 +85,21 @@ class PPtParser(BaseLife):
43
85
  media_dir.mkdir()
44
86
  tmp_file_path = temp_dir / "tmp.ppt"
45
87
  shutil.copy(file_path, tmp_file_path)
46
- pptx_file_path = self.ppt_to_pptx(ppt_path=str(tmp_file_path), dir_path=temp_path)
88
+ pptx_file_path = self.ppt_to_pptx(
89
+ ppt_path=str(tmp_file_path), dir_path=temp_path
90
+ )
47
91
  pptx_extractor = PPtExtractor()
48
- pages_list = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True)
49
- contents = ''
92
+ pages_list = pptx_extractor.extract(
93
+ Path(pptx_file_path), "tmp", temp_dir, media_dir, True
94
+ )
95
+ contents = ""
50
96
  for index, page in enumerate(pages_list):
51
- page_content_list = page['content_list']
97
+ page_content_list = page["content_list"]
52
98
  for content in page_content_list:
53
- if content['type'] == 'image':
99
+ if content["type"] == "image":
54
100
  pass
55
- elif content['type'] == "text":
56
- data = content['data']
101
+ elif content["type"] == "text":
102
+ data = content["data"]
57
103
  contents += data
58
104
  return contents
59
105
  except Exception:
@@ -61,12 +107,16 @@ class PPtParser(BaseLife):
61
107
 
62
108
  def parse(self, file_path: str) -> MarkdownOutputVo:
63
109
  try:
64
- title = self.get_file_extension(file_path)
110
+ title = os.path.splitext(os.path.basename(file_path))[0]
65
111
  content = self.read_ppt_file(file_path=file_path)
66
112
  # clean_text = clean_original_text(content)
67
113
  mk_content = content
68
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
69
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
114
+ lifecycle = self.generate_lifecycle(
115
+ source_file=file_path,
116
+ domain="Technology",
117
+ usage_purpose="Documentation",
118
+ life_type="LLM_ORIGIN",
119
+ )
70
120
  output_vo = MarkdownOutputVo(title, mk_content)
71
121
  output_vo.add_lifecycle(lifecycle)
72
122
  return output_vo.to_dict()
@@ -1,45 +1,45 @@
1
- import os
2
- from typing import Union
3
- from pptx import Presentation
4
- from datamax.parser.base import BaseLife
5
- from datamax.parser.base import MarkdownOutputVo
6
-
7
-
8
- class PPtxParser(BaseLife):
9
- def __init__(self, file_path: Union[str, list]):
10
- super().__init__()
11
- self.file_path = file_path
12
-
13
- @staticmethod
14
- def read_ppt_file(file_path: str):
15
- try:
16
- content = ''
17
- prs = Presentation(file_path)
18
- for slide in prs.slides:
19
- for shape in slide.shapes:
20
- if shape.has_text_frame:
21
- content += shape.text + '\n'
22
- # if shape.shape_type == 13:
23
- # if not os.path.exists("extracted_images"):
24
- # os.makedirs("extracted_images")
25
- # image = shape.image
26
- # image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
27
- # with open(image_filename, 'wb') as img_file:
28
- # img_file.write(image.blob)
29
- # content += ('[' + image_filename + ']')
30
- return content
31
- except Exception:
32
- raise
33
-
34
- def parse(self, file_path: str) -> MarkdownOutputVo:
35
- try:
36
- title = self.get_file_extension(file_path)
37
- content = self.read_ppt_file(file_path=file_path)
38
- mk_content = content
39
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
40
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
41
- output_vo = MarkdownOutputVo(title, mk_content)
42
- output_vo.add_lifecycle(lifecycle)
43
- return output_vo.to_dict()
44
- except Exception:
45
- raise
1
+ import os
2
+ from typing import Union
3
+ from pptx import Presentation
4
+ from datamax.parser.base import BaseLife
5
+ from datamax.parser.base import MarkdownOutputVo
6
+
7
+
8
+ class PPtxParser(BaseLife):
9
+ def __init__(self, file_path: Union[str, list]):
10
+ super().__init__()
11
+ self.file_path = file_path
12
+
13
+ @staticmethod
14
+ def read_ppt_file(file_path: str):
15
+ try:
16
+ content = ''
17
+ prs = Presentation(file_path)
18
+ for slide in prs.slides:
19
+ for shape in slide.shapes:
20
+ if shape.has_text_frame:
21
+ content += shape.text + '\n'
22
+ # if shape.shape_type == 13:
23
+ # if not os.path.exists("extracted_images"):
24
+ # os.makedirs("extracted_images")
25
+ # image = shape.image
26
+ # image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
27
+ # with open(image_filename, 'wb') as img_file:
28
+ # img_file.write(image.blob)
29
+ # content += ('[' + image_filename + ']')
30
+ return content
31
+ except Exception:
32
+ raise
33
+
34
+ def parse(self, file_path: str) -> MarkdownOutputVo:
35
+ try:
36
+ title = os.path.splitext(os.path.basename(file_path))[0]
37
+ content = self.read_ppt_file(file_path=file_path)
38
+ mk_content = content
39
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
40
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
41
+ output_vo = MarkdownOutputVo(title, mk_content)
42
+ output_vo.add_lifecycle(lifecycle)
43
+ return output_vo.to_dict()
44
+ except Exception:
45
+ raise
@@ -1,46 +1,46 @@
1
- import chardet
2
- from typing import Union
3
- from datamax.parser.base import BaseLife
4
- from datamax.parser.base import MarkdownOutputVo
5
-
6
-
7
- class TxtParser(BaseLife):
8
- def __init__(self, file_path: Union[str, list]):
9
- super().__init__()
10
- self.file_path = file_path
11
-
12
- @staticmethod
13
- def detect_encoding(file_path: str):
14
- try:
15
- with open(file_path, 'rb') as f:
16
- result = chardet.detect(f.read())
17
- return result['encoding']
18
- except Exception as e:
19
- raise e
20
-
21
- @staticmethod
22
- def read_txt_file(file_path: str) -> str:
23
- """
24
- Reads the Txt file in the specified path and returns its contents.
25
- :param file_path: indicates the path of the Txt file to be read.
26
- :return: str: Txt file contents.
27
- """
28
- try:
29
- encoding = TxtParser.detect_encoding(file_path)
30
- with open(file_path, 'r', encoding=encoding) as file:
31
- return file.read()
32
- except Exception as e:
33
- raise e
34
-
35
- def parse(self, file_path: str) -> MarkdownOutputVo:
36
- try:
37
- title = self.get_file_extension(file_path)
38
- content = self.read_txt_file(file_path=file_path) # 真实数据是从load加载
39
- mk_content = content
40
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
41
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
42
- output_vo = MarkdownOutputVo(title, mk_content)
43
- output_vo.add_lifecycle(lifecycle)
44
- return output_vo.to_dict()
45
- except Exception as e:
1
+ import chardet
2
+ from typing import Union
3
+ from datamax.parser.base import BaseLife
4
+ from datamax.parser.base import MarkdownOutputVo
5
+ import os
6
+
7
+ class TxtParser(BaseLife):
8
+ def __init__(self, file_path: Union[str, list]):
9
+ super().__init__()
10
+ self.file_path = file_path
11
+
12
+ @staticmethod
13
+ def detect_encoding(file_path: str):
14
+ try:
15
+ with open(file_path, 'rb') as f:
16
+ result = chardet.detect(f.read())
17
+ return result['encoding']
18
+ except Exception as e:
19
+ raise e
20
+
21
+ @staticmethod
22
+ def read_txt_file(file_path: str) -> str:
23
+ """
24
+ Reads the Txt file in the specified path and returns its contents.
25
+ :param file_path: indicates the path of the Txt file to be read.
26
+ :return: str: Txt file contents.
27
+ """
28
+ try:
29
+ encoding = TxtParser.detect_encoding(file_path)
30
+ with open(file_path, 'r', encoding=encoding) as file:
31
+ return file.read()
32
+ except Exception as e:
33
+ raise e
34
+
35
+ def parse(self, file_path: str) -> MarkdownOutputVo:
36
+ try:
37
+ title = os.path.splitext(os.path.basename(file_path))[0]
38
+ content = self.read_txt_file(file_path=file_path) # 真实数据是从load加载
39
+ mk_content = content
40
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
41
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
42
+ output_vo = MarkdownOutputVo(title, mk_content)
43
+ output_vo.add_lifecycle(lifecycle)
44
+ return output_vo.to_dict()
45
+ except Exception as e:
46
46
  raise e
@@ -1,26 +1,26 @@
1
- from datamax.parser.base import MarkdownOutputVo
2
- from datamax.parser.base import BaseLife
3
- import pandas as pd
4
- import warnings
5
-
6
- warnings.filterwarnings("ignore")
7
-
8
-
9
- class XlsParser(BaseLife):
10
- """xlsx or xls table use markitdown from Microsoft so magic for table!"""
11
-
12
- def __init__(self, file_path):
13
- super().__init__()
14
- self.file_path = file_path
15
-
16
- def parse(self, file_path: str) -> MarkdownOutputVo:
17
- try:
18
- df = pd.read_excel(file_path)
19
- mk_content = df.to_markdown(index=False)
20
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
21
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
22
- output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
23
- output_vo.add_lifecycle(lifecycle)
24
- return output_vo.to_dict()
25
- except Exception as e:
26
- raise e
1
+ from datamax.parser.base import MarkdownOutputVo
2
+ from datamax.parser.base import BaseLife
3
+ import pandas as pd
4
+ import warnings
5
+
6
+ warnings.filterwarnings("ignore")
7
+
8
+
9
+ class XlsParser(BaseLife):
10
+ """xlsx or xls table use markitdown from Microsoft so magic for table!"""
11
+
12
+ def __init__(self, file_path):
13
+ super().__init__()
14
+ self.file_path = file_path
15
+
16
+ def parse(self, file_path: str) -> MarkdownOutputVo:
17
+ try:
18
+ df = pd.read_excel(file_path)
19
+ mk_content = df.to_markdown(index=False)
20
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
21
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
22
+ output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
23
+ output_vo.add_lifecycle(lifecycle)
24
+ return output_vo.to_dict()
25
+ except Exception as e:
26
+ raise e