pydatamax 0.1.16.post1__tar.gz → 0.1.16.post2__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to the public registry. It is provided for informational purposes only.
Files changed (63)
  1. {pydatamax-0.1.16.post1/pydatamax.egg-info → pydatamax-0.1.16.post2}/PKG-INFO +54 -2
  2. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/README.md +53 -1
  3. pydatamax-0.1.16.post2/datamax/loader/core.py +144 -0
  4. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/minio_handler.py +38 -19
  5. pydatamax-0.1.16.post2/datamax/parser/__init__.py +3 -0
  6. pydatamax-0.1.16.post2/datamax/parser/base.py +101 -0
  7. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/core.py +215 -126
  8. pydatamax-0.1.16.post2/datamax/parser/csv_parser.py +51 -0
  9. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/doc_parser.py +230 -141
  10. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/docx_parser.py +275 -186
  11. pydatamax-0.1.16.post2/datamax/parser/epub_parser.py +77 -0
  12. pydatamax-0.1.16.post2/datamax/parser/html_parser.py +58 -0
  13. pydatamax-0.1.16.post2/datamax/parser/image_parser.py +72 -0
  14. pydatamax-0.1.16.post2/datamax/parser/json_parser.py +53 -0
  15. pydatamax-0.1.16.post2/datamax/parser/md_parser.py +92 -0
  16. pydatamax-0.1.16.post2/datamax/parser/pdf_parser.py +141 -0
  17. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/ppt_parser.py +41 -9
  18. pydatamax-0.1.16.post2/datamax/parser/pptx_parser.py +73 -0
  19. pydatamax-0.1.16.post2/datamax/parser/txt_parser.py +77 -0
  20. pydatamax-0.1.16.post2/datamax/parser/xls_parser.py +54 -0
  21. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/xlsx_parser.py +58 -51
  22. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/__init__.py +2 -1
  23. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/data_cleaner.py +36 -22
  24. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/env_setup.py +25 -18
  25. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/gotocr_pdf.py +13 -13
  26. pydatamax-0.1.16.post2/datamax/utils/lifecycle_types.py +18 -0
  27. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/mineru_operator.py +17 -15
  28. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/paddleocr_pdf_operator.py +34 -19
  29. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/ppt_extract.py +34 -11
  30. pydatamax-0.1.16.post2/datamax/utils/qa_generator.py +657 -0
  31. pydatamax-0.1.16.post2/datamax/utils/tokenizer.py +23 -0
  32. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/uno_handler.py +84 -72
  33. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2/pydatamax.egg-info}/PKG-INFO +54 -2
  34. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/SOURCES.txt +1 -0
  35. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/setup.py +1 -1
  36. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_doc_parser.py +3 -3
  37. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_docx_parser.py +5 -5
  38. pydatamax-0.1.16.post1/datamax/loader/core.py +0 -119
  39. pydatamax-0.1.16.post1/datamax/parser/__init__.py +0 -2
  40. pydatamax-0.1.16.post1/datamax/parser/base.py +0 -77
  41. pydatamax-0.1.16.post1/datamax/parser/csv_parser.py +0 -31
  42. pydatamax-0.1.16.post1/datamax/parser/epub_parser.py +0 -41
  43. pydatamax-0.1.16.post1/datamax/parser/html_parser.py +0 -38
  44. pydatamax-0.1.16.post1/datamax/parser/image_parser.py +0 -34
  45. pydatamax-0.1.16.post1/datamax/parser/json_parser.py +0 -32
  46. pydatamax-0.1.16.post1/datamax/parser/md_parser.py +0 -73
  47. pydatamax-0.1.16.post1/datamax/parser/pdf_parser.py +0 -101
  48. pydatamax-0.1.16.post1/datamax/parser/pptx_parser.py +0 -45
  49. pydatamax-0.1.16.post1/datamax/parser/txt_parser.py +0 -46
  50. pydatamax-0.1.16.post1/datamax/parser/xls_parser.py +0 -26
  51. pydatamax-0.1.16.post1/datamax/utils/qa_generator.py +0 -369
  52. pydatamax-0.1.16.post1/datamax/utils/tokenizer.py +0 -22
  53. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/LICENSE +0 -0
  54. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/__init__.py +0 -0
  55. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/__init__.py +0 -0
  56. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/oss_handler.py +0 -0
  57. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/constants.py +0 -0
  58. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/dependency_links.txt +0 -0
  59. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/requires.txt +0 -0
  60. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/top_level.txt +0 -0
  61. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/setup.cfg +0 -0
  62. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_docx_format_analysis.py +0 -0
  63. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_wps_doc.py +0 -0
{pydatamax-0.1.16.post1/pydatamax.egg-info → pydatamax-0.1.16.post2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydatamax
-Version: 0.1.16.post1
+Version: 0.1.16.post2
 Summary: A library for parsing and converting various file formats.
 Home-page: https://github.com/Hi-Dolphin/datamax
 Author: ccy
@@ -113,7 +113,7 @@ qa_data = dm.get_pre_label(
     question_number=5,  # questions generated per chunk
     max_workers=5  # number of concurrent workers
 )
-dm.save_label_data(res)
+dm.save_label_data(qa_data)
 ```
 
 ## 📖 Detailed Documentation
@@ -316,6 +316,58 @@ pip install -r requirements.txt
 python setup.py install
 ```
 
+### Developer Mode
+
+For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
+
+#### Setup Developer Mode
+
+```bash
+# Clone the repository
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+
+# Create virtual environment (recommended)
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install in developer mode
+pip install -e .
+```
+
+#### Benefits of Developer Mode
+
+- **Live Updates**: Changes to source code are immediately reflected without reinstallation
+- **Easy Testing**: Test your modifications instantly
+- **Debugging**: Better debugging experience with direct access to source code
+- **Development Workflow**: Seamless integration with your development environment
+
+#### Development Commands
+
+```bash
+# Run tests
+pytest
+
+# Install development dependencies
+pip install -r requirements-dev.txt  # if available
+
+# Check code style
+flake8 datamax/
+black datamax/
+
+# Build package
+python setup.py sdist bdist_wheel
+```
+
+#### Making Changes
+
+After installing in developer mode, you can:
+
+1. Edit source code in the `datamax/` directory
+2. Changes are automatically available when you import the module
+3. Test your changes immediately without reinstalling
+4. Submit pull requests with your improvements
+
 ## 📋 System Requirements
 
 - Python >= 3.10
{pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/README.md
@@ -52,7 +52,7 @@ qa_data = dm.get_pre_label(
     question_number=5,  # questions generated per chunk
     max_workers=5  # number of concurrent workers
 )
-dm.save_label_data(res)
+dm.save_label_data(qa_data)
 ```
 
 ## 📖 Detailed Documentation
@@ -255,6 +255,58 @@ pip install -r requirements.txt
 python setup.py install
 ```
 
+### Developer Mode
+
+For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
+
+#### Setup Developer Mode
+
+```bash
+# Clone the repository
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+
+# Create virtual environment (recommended)
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install in developer mode
+pip install -e .
+```
+
+#### Benefits of Developer Mode
+
+- **Live Updates**: Changes to source code are immediately reflected without reinstallation
+- **Easy Testing**: Test your modifications instantly
+- **Debugging**: Better debugging experience with direct access to source code
+- **Development Workflow**: Seamless integration with your development environment
+
+#### Development Commands
+
+```bash
+# Run tests
+pytest
+
+# Install development dependencies
+pip install -r requirements-dev.txt  # if available
+
+# Check code style
+flake8 datamax/
+black datamax/
+
+# Build package
+python setup.py sdist bdist_wheel
+```
+
+#### Making Changes
+
+After installing in developer mode, you can:
+
+1. Edit source code in the `datamax/` directory
+2. Changes are automatically available when you import the module
+3. Test your changes immediately without reinstalling
+4. Submit pull requests with your improvements
+
 ## 📋 System Requirements
 
 - Python >= 3.10
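
The `save_label_data` hunks above (the same fix appears in PKG-INFO and README.md, since the package's long description embeds the README) correct a bug in the quick-start example: it previously saved an undefined variable `res` instead of the `qa_data` returned by `get_pre_label`. A condensed view of the corrected example; the `get_pre_label` arguments outside the diff context are elided here as well:

```python
qa_data = dm.get_pre_label(
    # ... arguments not visible in this diff context ...
    question_number=5,  # questions generated per chunk
    max_workers=5,      # number of concurrent workers
)
dm.save_label_data(qa_data)  # previously: dm.save_label_data(res)
```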
pydatamax-0.1.16.post2/datamax/loader/core.py
@@ -0,0 +1,144 @@
+import os
+from typing import List
+
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient
+
+
+class DataLoader:
+    def __init__(
+        self,
+        endpoint: str = None,
+        secret_key: str = None,
+        access_key: str = None,
+        bucket_name: str = None,
+        source: str = None,
+    ):
+        if source and source == "Oss":
+            self.oss = OssClient(
+                oss_endpoint=endpoint,
+                oss_access_key_secret=secret_key,
+                oss_access_key_id=access_key,
+                oss_bucket_name=bucket_name,
+            )
+        elif source and source == "MinIO":
+            self.mi = MinIOClient(
+                endpoint=endpoint,
+                secret_key=secret_key,
+                access_key=access_key,
+                bucket_name=bucket_name,
+            )
+        self.download_path = str("./download_file")
+        self.source = source
+        self.bucket_name = bucket_name
+
+    @staticmethod
+    def load_from_file(local_file_path) -> List[str]:
+        if os.path.isfile(local_file_path):
+            if os.path.exists(local_file_path):
+                if os.access(local_file_path, os.R_OK):
+                    return [local_file_path]
+                else:
+                    return []
+            else:
+                return []
+        elif os.path.isdir(local_file_path):
+            access_path = []
+            # Recursively process all files and subdirectories under the current directory.
+            for item in os.listdir(local_file_path):
+                item_path = os.path.join(local_file_path, item)
+                item_results = DataLoader.load_from_file(item_path)
+                access_path.extend(item_results)
+            return access_path
+        else:
+            return []
+
+    def load_from_oss_source(self, oss_path: str) -> List[str]:
+        if not os.path.exists(self.download_path):
+            os.makedirs(self.download_path)
+
+        self.download(oss_path=oss_path)
+
+        file_list = []
+        for root, dirs, files in os.walk(self.download_path):
+            for file in files:
+                file_path = os.path.join(self.download_path, file)
+                file_list.append(file_path)
+
+        success_file_list = []
+        for file_path in file_list:
+            if self.load_from_file(file_path):
+                success_file_list.append(file_path)
+
+        return success_file_list
+
+    def download(self, oss_path: str):
+        if self.source == "MinIO":
+            file_list = self.mi.list_objects(
+                bucket_name=self.bucket_name, prefix=oss_path
+            )
+            for path in file_list:
+                self.mi.download_file(
+                    bucket_name=self.bucket_name,
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
+        elif self.source == "Oss":
+            keys = self.oss.get_objects_in_folders(prefix=oss_path)
+            for path in keys:
+                self.oss.get_object_to_file(
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
+
+    def upload(self, local_file_path: str, save_prefix: str):
+        if self.source == "MinIO":
+            if os.path.isdir(local_file_path):
+                for root, dirs, files in os.walk(local_file_path):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        self.mi.upload_file(
+                            bucket_name=self.bucket_name,
+                            object_name=save_prefix + f"{file}",
+                            file_path=file_path,
+                        )
+            elif os.path.isfile(local_file_path):
+                self.mi.upload_file(
+                    bucket_name=self.bucket_name,
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
+            else:
+                pass
+
+        elif self.source == "Oss":
+            if os.path.isdir(local_file_path):
+                self.oss.put_object_from_folder(
+                    object_folder_name=save_prefix, local_folder_path=local_file_path
+                )
+            elif os.path.isfile(local_file_path):
+                self.oss.put_object_from_file(
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
+            else:
+                pass
+
+    def share(
+        self,
+        oss_path: str,
+        expires: int = None,
+        aliyun_oss_url_prefix: str = None,
+        csnt_url_prefix: str = None,
+    ):
+        if self.source == "MinIO":
+            return self.mi.get_object_tmp_link(
+                bucket_name=self.bucket_name, object_name=oss_path, expires=expires
+            )
+        elif self.source == "Oss":
+            return self.oss.get_oss_url(
+                object_name=oss_path,
+                url_expires_time=expires,
+                aliyun_oss_url_prefix=aliyun_oss_url_prefix,
+                csnt_url_prefix=csnt_url_prefix,
+            )
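
For orientation, here is a minimal usage sketch of the new `DataLoader`. It relies only on the constructor and methods visible in the hunk above; the endpoint, credentials, bucket name, and paths are placeholders, not values from the package.

```python
from datamax.loader.core import DataLoader

# Wire up a MinIO-backed loader (source may be "MinIO" or "Oss").
loader = DataLoader(
    endpoint="minio.example.com:9000",  # placeholder endpoint
    access_key="YOUR_ACCESS_KEY",       # placeholder credential
    secret_key="YOUR_SECRET_KEY",       # placeholder credential
    bucket_name="datasets",             # placeholder bucket
    source="MinIO",
)

# Local paths: returns the readable files, recursing into directories.
readable = DataLoader.load_from_file("./docs")

# Remote prefix: downloads objects into ./download_file and returns
# the local paths that turned out to be readable.
downloaded = loader.load_from_oss_source(oss_path="raw/2024/")
```

Note that `download` flattens object keys to their basenames (`path.split("/")[-1]`), so distinct objects sharing a filename will overwrite each other in `./download_file`.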
{pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/minio_handler.py
@@ -1,11 +1,12 @@
 import os
-from dotenv import load_dotenv
+import re
 from datetime import timedelta
+
+from dotenv import load_dotenv
+from loguru import logger
 from minio import Minio
 from minio.commonconfig import Tags
 from minio.error import S3Error
-from loguru import logger
-import re
 
 load_dotenv()
 
@@ -25,7 +26,7 @@ class MinIOClient:
                 self.endpoint,
                 access_key=self.access_key,
                 secret_key=self.secret_key,
-                secure=self.secure
+                secure=self.secure,
             )
             return client
         except S3Error as e:
@@ -55,7 +56,9 @@ class MinIOClient:
         if self.client:
             try:
                 self.client.fput_object(bucket_name, object_name, file_path)
-                logger.info(f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'.")
+                logger.info(
+                    f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'."
+                )
             except S3Error as e:
                 raise
 
@@ -63,15 +66,18 @@
         if self.client:
             try:
                 self.client.fget_object(bucket_name, object_name, file_path)
-                logger.info(f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'.")
+                logger.info(
+                    f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'."
+                )
                 return file_path
             except Exception as e:
                 try:
                     illegal_chars = r'[\/:*?"<>|]'
-                    file_path = re.sub(illegal_chars, '_', file_path)
+                    file_path = re.sub(illegal_chars, "_", file_path)
                     self.client.fget_object(bucket_name, object_name, file_path)
                     logger.info(
-                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'.")
+                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'."
+                    )
                     return file_path
                 except Exception as e:
                     raise
@@ -81,7 +87,9 @@
         try:
             result_list = []
             if prefix:
-                objects = self.client.list_objects(bucket_name, recursive=True, prefix=prefix)
+                objects = self.client.list_objects(
+                    bucket_name, recursive=True, prefix=prefix
+                )
             else:
                 objects = self.client.list_objects(bucket_name, recursive=True)
             logger.info(f"Objects in bucket '{bucket_name}':")
@@ -99,8 +107,7 @@
             raise
 
     def calculate_bucket_stats(self, bucket_name, prefix):
-        objects = self.client.list_objects(bucket_name,
-                                           prefix=prefix, recursive=True)
+        objects = self.client.list_objects(bucket_name, prefix=prefix, recursive=True)
         total_size = 0
         object_count = 0
 
@@ -115,14 +122,16 @@
     def get_objects(self, bucket_name, object_name):
        try:
            response = self.client.get_object(bucket_name, object_name)
-           content = response.read().decode('utf-8')
+           content = response.read().decode("utf-8")
            return content
        except Exception as e:
            raise
 
    def get_object_tag(self, bucket_name, object_name):
        try:
-           tags = self.client.get_object_tags(bucket_name=bucket_name, object_name=object_name)
+           tags = self.client.get_object_tags(
+               bucket_name=bucket_name, object_name=object_name
+           )
            return tags
        except Exception as e:
            raise
@@ -130,7 +139,9 @@
    def update_object_tag(self, bucket_name, object_name, tags):
        try:
            tags_obj = Tags.new_object_tags()
-           tag_info = self.get_object_tag(bucket_name=bucket_name, object_name=object_name)
+           tag_info = self.get_object_tag(
+               bucket_name=bucket_name, object_name=object_name
+           )
            if tag_info is None:
                tag_info = {}
                for tag_dict in tags:
@@ -142,7 +153,9 @@
 
                for k, v in tag_info.items():
                    tags_obj[k] = v
-               self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
+               self.client.set_object_tags(
+                   bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+               )
            else:
                for tag_dict in tags:
                    for tag_key, tag_value in tag_dict.items():
@@ -153,20 +166,26 @@
 
                for k, v in tag_info.items():
                    tags_obj[k] = v
-               self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
+               self.client.set_object_tags(
+                   bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+               )
            return tag_info
        except Exception as e:
            raise
 
    def reset_object_tag(self, bucket_name, object_name):
        try:
-           self.client.delete_object_tags(bucket_name=bucket_name, object_name=object_name)
+           self.client.delete_object_tags(
+               bucket_name=bucket_name, object_name=object_name
+           )
            return True
        except Exception as e:
            raise
 
    def get_object_tmp_link(self, bucket_name, object_name, expires):
        try:
-           return self.client.presigned_get_object(bucket_name, object_name, expires=timedelta(days=expires))
+           return self.client.presigned_get_object(
+               bucket_name, object_name, expires=timedelta(days=expires)
+           )
        except Exception as e:
-           raise
+           raise
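
The `minio_handler.py` hunks above are essentially formatter output (wrapped argument lists, trailing commas, double quotes) plus regrouped imports; behavior is unchanged. One detail worth calling out from the last hunk: `get_object_tmp_link` passes `expires` through `timedelta(days=expires)`, so the argument counts days, not seconds. A minimal sketch, assuming placeholder credentials; the constructor arguments mirror the `DataLoader` wiring shown earlier:

```python
from datamax.loader.minio_handler import MinIOClient

# All values here are placeholders, not real configuration.
client = MinIOClient(
    endpoint="minio.example.com:9000",
    access_key="YOUR_ACCESS_KEY",
    secret_key="YOUR_SECRET_KEY",
    bucket_name="datasets",
)

# `expires` is a number of days: presigned_get_object is called with
# timedelta(days=expires) internally.
url = client.get_object_tmp_link(
    bucket_name="datasets", object_name="raw/report.pdf", expires=7
)
```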
pydatamax-0.1.16.post2/datamax/parser/__init__.py
@@ -0,0 +1,3 @@
+from loguru import logger
+
+from .core import DataMax
pydatamax-0.1.16.post2/datamax/parser/base.py
@@ -0,0 +1,101 @@
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Union
+
+from datamax.utils.lifecycle_types import LifeType
+from datamax.utils.tokenizer import DashScopeClient
+
+
+class LifeCycle:
+    """
+    Life cycle class
+    """
+
+    def __init__(
+        self, update_time: str, life_type: list, life_metadata: Dict[str, str]
+    ):
+        self.update_time = update_time  # Update time
+        self.life_type = life_type  # Life cycle type
+        self.life_metadata = life_metadata  # Life cycle metadata
+
+    def update(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
+        self.update_time = update_time
+        self.life_type = life_type
+        self.life_metadata.update(life_metadata)
+
+    def __str__(self):
+        metadata_str = ", ".join(f"{k}: {v}" for k, v in self.life_metadata.items())
+        return f"update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}"
+
+    def to_dict(self):
+        return {
+            "update_time": self.update_time,
+            "life_type": self.life_type,
+            "life_metadata": self.life_metadata,
+        }
+
+
+class MarkdownOutputVo:
+    """
+    Markdown output conversion
+    """
+
+    def __init__(self, extension: str, content: str):
+        self.extension: str = extension  # File type
+        self.content: str = content  # Markdown content
+        self.lifecycle: List[LifeCycle] = []  # Life cycle data
+
+    def add_lifecycle(self, lifecycle: LifeCycle):
+        self.lifecycle.append(lifecycle)
+
+    def to_dict(self):
+        data_dict = {
+            "extension": self.extension,
+            "content": self.content,
+            "lifecycle": [lc.to_dict() for lc in self.lifecycle],
+        }
+        return data_dict
+
+
+class BaseLife:
+    tk_client = DashScopeClient()
+
+    @staticmethod
+    def generate_lifecycle(
+        source_file: str,
+        domain: str,
+        life_type: Union[LifeType, str, List[Union[LifeType, str]]],
+        usage_purpose: str,
+    ) -> LifeCycle:
+        """
+        Construct a LifeCycle record; accepts a single enum/string or a mixed list of both.
+        """
+        # 1) Normalize the input to a list
+        if isinstance(life_type, (list, tuple)):
+            raw = list(life_type)
+        else:
+            raw = [life_type]
+
+        # 2) For enum members, take their value
+        life_list: List[str] = [
+            lt.value if isinstance(lt, LifeType) else lt for lt in raw
+        ]
+
+        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        try:
+            storage = os.path.getsize(source_file)
+        except Exception:
+            storage = 0
+        life_metadata = {
+            "storage_size": storage,
+            "source_file": source_file,
+            "domain": domain,
+            "usage_purpose": usage_purpose,
+        }
+        return LifeCycle(update_time, life_list, life_metadata)
+
+    @staticmethod
+    def get_file_extension(file_path):
+        file_path = Path(file_path)
+        return file_path.suffix[1:].lower()
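
Finally, a sketch of how the new lifecycle pieces in `parser/base.py` compose. `generate_lifecycle` accepts a `LifeType` enum member, a plain string, or a mixed list of both; a string is used below because the members of `LifeType` live in `datamax/utils/lifecycle_types.py`, whose body is not shown in this diff, so the label is hypothetical.

```python
from datamax.parser.base import BaseLife, MarkdownOutputVo

# Build a lifecycle record; storage_size falls back to 0 when the
# source file cannot be stat'ed.
lifecycle = BaseLife.generate_lifecycle(
    source_file="./docs/report.pdf",  # placeholder path
    domain="Technology",
    life_type="DATA_PROCESSING",      # hypothetical label; enum members not shown
    usage_purpose="Parsing",
)

# Attach it to a Markdown output value object and serialize.
vo = MarkdownOutputVo(extension="pdf", content="# parsed markdown ...")
vo.add_lifecycle(lifecycle)
print(vo.to_dict())  # {"extension": "pdf", "content": "...", "lifecycle": [{...}]}
```

Note that importing `BaseLife` instantiates the class-level `DashScopeClient` (`tk_client`), so the tokenizer dependency must be importable even for this metadata-only path.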