pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +84 -72
  31. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/loader/core.py CHANGED
@@ -1,27 +1,34 @@
 import os
 from typing import List
+
 from datamax.loader.minio_handler import MinIOClient
 from datamax.loader.oss_handler import OssClient
 
 
 class DataLoader:
-    def __init__(self, endpoint: str = None, secret_key: str = None, access_key: str = None,
-                 bucket_name: str = None, source: str = None):
-        if source and source == 'Oss':
+    def __init__(
+        self,
+        endpoint: str = None,
+        secret_key: str = None,
+        access_key: str = None,
+        bucket_name: str = None,
+        source: str = None,
+    ):
+        if source and source == "Oss":
             self.oss = OssClient(
                 oss_endpoint=endpoint,
                 oss_access_key_secret=secret_key,
                 oss_access_key_id=access_key,
-                oss_bucket_name=bucket_name
+                oss_bucket_name=bucket_name,
             )
-        elif source and source == 'MinIO':
+        elif source and source == "MinIO":
             self.mi = MinIOClient(
                 endpoint=endpoint,
                 secret_key=secret_key,
                 access_key=access_key,
-                bucket_name=bucket_name
+                bucket_name=bucket_name,
             )
-        self.download_path = str('./download_file')
+        self.download_path = str("./download_file")
         self.source = source
         self.bucket_name = bucket_name
 
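The reformatted constructor is easier to scan; for reference, a minimal usage sketch (endpoint, credentials, and bucket are placeholders, and source must be exactly "MinIO" or "Oss"):

    from datamax.loader.core import DataLoader

    loader = DataLoader(
        endpoint="play.min.io:9000",   # placeholder endpoint
        access_key="YOUR_ACCESS_KEY",  # placeholder credential
        secret_key="YOUR_SECRET_KEY",  # placeholder credential
        bucket_name="example-bucket",
        source="MinIO",                # or "Oss" for the Aliyun client
    )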
@@ -37,16 +44,11 @@ class DataLoader:
             return []
         elif os.path.isdir(local_file_path):
             access_path = []
-            for root, dirs, files in os.walk(local_file_path):
-                for file in files:
-                    file_path = os.path.join(root, file)
-                    if os.path.exists(file_path):
-                        if os.access(file_path, os.R_OK):
-                            access_path.append(file_path)
-                        else:
-                            continue
-                    else:
-                        continue
+            # Recursively process all files and subdirectories under the current directory.
+            for item in os.listdir(local_file_path):
+                item_path = os.path.join(local_file_path, item)
+                item_results = DataLoader.load_from_file(item_path)
+                access_path.extend(item_results)
             return access_path
         else:
             return []
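The flat os.walk scan with its per-file os.access readability check is replaced by recursion through load_from_file itself, so whatever checks the base case performs now apply uniformly at every depth. A standalone sketch of the same pattern with the readability check kept explicit (a hypothetical helper, not the packaged code):

    import os

    def collect_readable_files(path: str) -> list:
        # Base case: keep a file only if it is readable.
        if os.path.isfile(path):
            return [path] if os.access(path, os.R_OK) else []
        # Recursive case: descend into each directory entry.
        if os.path.isdir(path):
            results = []
            for item in os.listdir(path):
                results.extend(collect_readable_files(os.path.join(path, item)))
            return results
        return []  # anything else, e.g. a broken symlink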
@@ -71,49 +73,72 @@ class DataLoader:
         return success_file_list
 
     def download(self, oss_path: str):
-        if self.source == 'MinIO':
-            file_list = self.mi.list_objects(bucket_name=self.bucket_name, prefix=oss_path)
+        if self.source == "MinIO":
+            file_list = self.mi.list_objects(
+                bucket_name=self.bucket_name, prefix=oss_path
+            )
             for path in file_list:
-                self.mi.download_file(bucket_name=self.bucket_name, object_name=path,
-                                      file_path=f'{self.download_path}/{path.split("/")[-1]}')
+                self.mi.download_file(
+                    bucket_name=self.bucket_name,
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
         elif self.source == "Oss":
             keys = self.oss.get_objects_in_folders(prefix=oss_path)
             for path in keys:
-                self.oss.get_object_to_file(object_name=path,
-                                            file_path=f'{self.download_path}/{path.split("/")[-1]}')
+                self.oss.get_object_to_file(
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
 
     def upload(self, local_file_path: str, save_prefix: str):
-        if self.source == 'MinIO':
+        if self.source == "MinIO":
             if os.path.isdir(local_file_path):
                 for root, dirs, files in os.walk(local_file_path):
                     for file in files:
                         file_path = os.path.join(root, file)
-                        self.mi.upload_file(bucket_name=self.bucket_name, object_name=save_prefix + f'{file}',
-                                            file_path=file_path)
+                        self.mi.upload_file(
+                            bucket_name=self.bucket_name,
+                            object_name=save_prefix + f"{file}",
+                            file_path=file_path,
+                        )
             elif os.path.isfile(local_file_path):
-                self.mi.upload_file(bucket_name=self.bucket_name,
-                                    object_name=save_prefix + os.path.basename(local_file_path),
-                                    file_path=local_file_path)
+                self.mi.upload_file(
+                    bucket_name=self.bucket_name,
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
             else:
                 pass
 
         elif self.source == "Oss":
             if os.path.isdir(local_file_path):
-                self.oss.put_object_from_folder(object_folder_name=save_prefix, local_folder_path=local_file_path)
+                self.oss.put_object_from_folder(
+                    object_folder_name=save_prefix, local_folder_path=local_file_path
+                )
             elif os.path.isfile(local_file_path):
-                self.oss.put_object_from_file(object_name=save_prefix + os.path.basename(local_file_path),
-                                              file_path=local_file_path)
+                self.oss.put_object_from_file(
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
             else:
                 pass
 
-    def share(self, oss_path: str,
-              expires: int = None,
-              aliyun_oss_url_prefix: str = None,
-              csnt_url_prefix: str = None):
-        if self.source == 'MinIO':
-            return self.mi.get_object_tmp_link(bucket_name=self.bucket_name, object_name=oss_path, expires=expires)
+    def share(
+        self,
+        oss_path: str,
+        expires: int = None,
+        aliyun_oss_url_prefix: str = None,
+        csnt_url_prefix: str = None,
+    ):
+        if self.source == "MinIO":
+            return self.mi.get_object_tmp_link(
+                bucket_name=self.bucket_name, object_name=oss_path, expires=expires
+            )
         elif self.source == "Oss":
-            return self.oss.get_oss_url(object_name=oss_path,
-                                        url_expires_time=expires,
-                                        aliyun_oss_url_prefix=aliyun_oss_url_prefix,
-                                        csnt_url_prefix=csnt_url_prefix)
+            return self.oss.get_oss_url(
+                object_name=oss_path,
+                url_expires_time=expires,
+                aliyun_oss_url_prefix=aliyun_oss_url_prefix,
+                csnt_url_prefix=csnt_url_prefix,
+            )
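Taken together, a typical round trip through the reformatted methods might look like this (paths, prefix, and expiry are illustrative; for MinIO, expires is forwarded as a number of days):

    # Assumes `loader` was constructed as above with source="MinIO".
    loader.upload(local_file_path="./reports", save_prefix="reports/")
    loader.download(oss_path="reports/")
    url = loader.share(oss_path="reports/summary.pdf", expires=7)  # 7-day link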
datamax/loader/minio_handler.py CHANGED
@@ -1,11 +1,12 @@
 import os
-from dotenv import load_dotenv
+import re
 from datetime import timedelta
+
+from dotenv import load_dotenv
+from loguru import logger
 from minio import Minio
 from minio.commonconfig import Tags
 from minio.error import S3Error
-from loguru import logger
-import re
 
 load_dotenv()
 
@@ -25,7 +26,7 @@ class MinIOClient:
                 self.endpoint,
                 access_key=self.access_key,
                 secret_key=self.secret_key,
-                secure=self.secure
+                secure=self.secure,
             )
             return client
         except S3Error as e:
@@ -55,7 +56,9 @@ class MinIOClient:
         if self.client:
             try:
                 self.client.fput_object(bucket_name, object_name, file_path)
-                logger.info(f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'.")
+                logger.info(
+                    f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'."
+                )
             except S3Error as e:
                 raise
 
@@ -63,15 +66,18 @@ class MinIOClient:
         if self.client:
             try:
                 self.client.fget_object(bucket_name, object_name, file_path)
-                logger.info(f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'.")
+                logger.info(
+                    f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'."
+                )
                 return file_path
             except Exception as e:
                 try:
                     illegal_chars = r'[\/:*?"<>|]'
-                    file_path = re.sub(illegal_chars, '_', file_path)
+                    file_path = re.sub(illegal_chars, "_", file_path)
                     self.client.fget_object(bucket_name, object_name, file_path)
                     logger.info(
-                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'.")
+                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'."
+                    )
                     return file_path
                 except Exception as e:
                     raise
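The retry path strips characters that are illegal in Windows file names before attempting fget_object a second time (the stray trailing apostrophe in the second log message is unchanged by this rewrap). The substitution is easy to verify in isolation; note that the character class also matches "/", so directory separators in file_path are rewritten as well:

    import re

    illegal_chars = r'[\/:*?"<>|]'
    print(re.sub(illegal_chars, "_", 'logs/2024:01*report?.txt'))
    # logs_2024_01_report_.txt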
@@ -81,7 +87,9 @@ class MinIOClient:
         try:
             result_list = []
             if prefix:
-                objects = self.client.list_objects(bucket_name, recursive=True, prefix=prefix)
+                objects = self.client.list_objects(
+                    bucket_name, recursive=True, prefix=prefix
+                )
             else:
                 objects = self.client.list_objects(bucket_name, recursive=True)
             logger.info(f"Objects in bucket '{bucket_name}':")
@@ -99,8 +107,7 @@ class MinIOClient:
             raise
 
     def calculate_bucket_stats(self, bucket_name, prefix):
-        objects = self.client.list_objects(bucket_name,
-                                           prefix=prefix, recursive=True)
+        objects = self.client.list_objects(bucket_name, prefix=prefix, recursive=True)
         total_size = 0
         object_count = 0
 
@@ -115,14 +122,16 @@ class MinIOClient:
     def get_objects(self, bucket_name, object_name):
         try:
             response = self.client.get_object(bucket_name, object_name)
-            content = response.read().decode('utf-8')
+            content = response.read().decode("utf-8")
             return content
         except Exception as e:
             raise
 
     def get_object_tag(self, bucket_name, object_name):
         try:
-            tags = self.client.get_object_tags(bucket_name=bucket_name, object_name=object_name)
+            tags = self.client.get_object_tags(
+                bucket_name=bucket_name, object_name=object_name
+            )
             return tags
         except Exception as e:
             raise
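update_object_tag below merges a list of {key: value} dicts into an object's existing tags via get_object_tag. A hedged usage sketch (client, bucket, and object names are placeholders):

    # Assumes `mc` is a connected MinIOClient instance.
    mc.update_object_tag(
        bucket_name="example-bucket",
        object_name="reports/summary.pdf",
        tags=[{"stage": "parsed"}, {"owner": "datamax"}],
    )
    print(mc.get_object_tag("example-bucket", "reports/summary.pdf"))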
@@ -130,7 +139,9 @@ class MinIOClient:
     def update_object_tag(self, bucket_name, object_name, tags):
         try:
             tags_obj = Tags.new_object_tags()
-            tag_info = self.get_object_tag(bucket_name=bucket_name, object_name=object_name)
+            tag_info = self.get_object_tag(
+                bucket_name=bucket_name, object_name=object_name
+            )
             if tag_info is None:
                 tag_info = {}
                 for tag_dict in tags:
@@ -142,7 +153,9 @@ class MinIOClient:
 
                 for k, v in tag_info.items():
                     tags_obj[k] = v
-                self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
+                self.client.set_object_tags(
+                    bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+                )
             else:
                 for tag_dict in tags:
                     for tag_key, tag_value in tag_dict.items():
@@ -153,20 +166,26 @@ class MinIOClient:
 
                 for k, v in tag_info.items():
                     tags_obj[k] = v
-                self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
+                self.client.set_object_tags(
+                    bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+                )
             return tag_info
         except Exception as e:
             raise
 
     def reset_object_tag(self, bucket_name, object_name):
         try:
-            self.client.delete_object_tags(bucket_name=bucket_name, object_name=object_name)
+            self.client.delete_object_tags(
+                bucket_name=bucket_name, object_name=object_name
+            )
             return True
         except Exception as e:
             raise
 
     def get_object_tmp_link(self, bucket_name, object_name, expires):
         try:
-            return self.client.presigned_get_object(bucket_name, object_name, expires=timedelta(days=expires))
+            return self.client.presigned_get_object(
+                bucket_name, object_name, expires=timedelta(days=expires)
+            )
         except Exception as e:
-            raise
+            raise
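get_object_tmp_link interprets expires as a number of days and forwards it as a timedelta; presigned URLs are capped at seven days in the S3 protocol, so larger values should be expected to fail in the SDK. A usage sketch (names are placeholders):

    # Assumes `mc` is a connected MinIOClient instance.
    url = mc.get_object_tmp_link(
        bucket_name="example-bucket",
        object_name="reports/summary.pdf",
        expires=3,  # days, passed as timedelta(days=3)
    )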
datamax/parser/__init__.py CHANGED
@@ -1,2 +1,3 @@
+from loguru import logger
+
 from .core import DataMax
-from loguru import logger
datamax/parser/base.py CHANGED
@@ -1,7 +1,9 @@
 import os
 from datetime import datetime
 from pathlib import Path
-from typing import List, Dict
+from typing import Dict, List, Union
+
+from datamax.utils.lifecycle_types import LifeType
 from datamax.utils.tokenizer import DashScopeClient
 
 
@@ -10,7 +12,9 @@ class LifeCycle:
     Life cycle class
     """
 
-    def __init__(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
+    def __init__(
+        self, update_time: str, life_type: list, life_metadata: Dict[str, str]
+    ):
         self.update_time = update_time  # Update time
         self.life_type = life_type  # Life cycle type
         self.life_metadata = life_metadata  # Life cycle metadata
@@ -21,14 +25,14 @@ class LifeCycle:
         self.life_metadata.update(life_metadata)
 
     def __str__(self):
-        metadata_str = ', '.join(f'{k}: {v}' for k, v in self.life_metadata.items())
-        return f'update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}'
+        metadata_str = ", ".join(f"{k}: {v}" for k, v in self.life_metadata.items())
+        return f"update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}"
 
     def to_dict(self):
         return {
-            'update_time': self.update_time,
-            'life_type': self.life_type,
-            'life_metadata': self.life_metadata
+            "update_time": self.update_time,
+            "life_type": self.life_type,
+            "life_metadata": self.life_metadata,
         }
 
 
@@ -37,8 +41,8 @@ class MarkdownOutputVo:
     Markdown output conversion
     """
 
-    def __init__(self, title: str, content: str):
-        self.title: str = title  # File type
+    def __init__(self, extension: str, content: str):
+        self.extension: str = extension  # File type
         self.content: str = content  # Markdown content
         self.lifecycle: List[LifeCycle] = []  # Life cycle data
 
@@ -47,9 +51,9 @@ class MarkdownOutputVo:
 
     def to_dict(self):
         data_dict = {
-            'title': self.title,
-            'content': self.content,
-            'lifecycle': [lc.to_dict() for lc in self.lifecycle]
+            "extension": self.extension,
+            "content": self.content,
+            "lifecycle": [lc.to_dict() for lc in self.lifecycle],
         }
         return data_dict
 
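Note the breaking rename: title becomes extension both on the constructor and in to_dict, so any consumer indexing the output on "title" must be updated. The new shape:

    vo = MarkdownOutputVo(extension="docx", content="# Heading\n...")
    print(vo.to_dict())
    # {'extension': 'docx', 'content': '# Heading\n...', 'lifecycle': []}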
@@ -58,20 +62,40 @@ class BaseLife:
     tk_client = DashScopeClient()
 
     @staticmethod
-    def generate_lifecycle(source_file, domain, life_type, usage_purpose) -> LifeCycle:
+    def generate_lifecycle(
+        source_file: str,
+        domain: str,
+        life_type: Union[LifeType, str, List[Union[LifeType, str]]],
+        usage_purpose: str,
+    ) -> LifeCycle:
+        """
+        Build a LifeCycle record; accepts a single enum/string or a mixed list.
+        """
+        # 1) Normalize the input to a list.
+        if isinstance(life_type, (list, tuple)):
+            raw = list(life_type)
+        else:
+            raw = [life_type]
+
+        # 2) For enum members, take their value.
+        life_list: List[str] = [
+            lt.value if isinstance(lt, LifeType) else lt for lt in raw
+        ]
+
         update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        life_type = [life_type]
-        storage = os.stat(source_file)
+        try:
+            storage = os.path.getsize(source_file)
+        except Exception:
+            storage = 0
         life_metadata = {
-            # "token_count": token_count,  # Token count of the text
-            "storage_size": storage.st_size,  # Storage size in bytes
-            "source_file": source_file,  # Source file
-            "domain": domain,  # Domain
-            "usage_purpose": usage_purpose  # Usage purpose
+            "storage_size": storage,
+            "source_file": source_file,
+            "domain": domain,
+            "usage_purpose": usage_purpose,
         }
-        return LifeCycle(update_time, life_type, life_metadata)
+        return LifeCycle(update_time, life_list, life_metadata)
 
     @staticmethod
     def get_file_extension(file_path):
         file_path = Path(file_path)
-        return file_path.suffix[1:].lower()
+        return file_path.suffix[1:].lower()
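With the normalization above, generate_lifecycle accepts a single LifeType member, a plain string, or a mixed list, flattens enum members to their values, and now degrades to storage_size 0 instead of raising when the source file cannot be stat'ed. A usage sketch; LifeType.DATA_PROCESSING is a hypothetical member name, since lifecycle_types.py is new in this release and its members are not shown in this diff:

    from datamax.parser.base import BaseLife
    from datamax.utils.lifecycle_types import LifeType

    lc = BaseLife.generate_lifecycle(
        source_file="example.docx",
        domain="demo",
        life_type=[LifeType.DATA_PROCESSING, "custom_stage"],  # hypothetical member
        usage_purpose="Parsing",
    )
    print(lc.to_dict()["life_type"])  # e.g. ['data_processing', 'custom_stage']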