pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/pptx_parser.py CHANGED
@@ -1,11 +1,13 @@
-import os
 from typing import Union
+
+from loguru import logger
 from pptx import Presentation
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 
 
-class PPtxParser(BaseLife):
+class PptxParser(BaseLife):
     def __init__(self, file_path: Union[str, list]):
         super().__init__()
         self.file_path = file_path
@@ -13,33 +15,59 @@ class PPtxParser(BaseLife):
     @staticmethod
     def read_ppt_file(file_path: str):
         try:
-            content = ''
+            content = ""
             prs = Presentation(file_path)
             for slide in prs.slides:
                 for shape in slide.shapes:
                     if shape.has_text_frame:
-                        content += shape.text + '\n'
-                        # if shape.shape_type == 13:
-                        #     if not os.path.exists("extracted_images"):
-                        #         os.makedirs("extracted_images")
-                        #     image = shape.image
-                        #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
-                        #     with open(image_filename, 'wb') as img_file:
-                        #         img_file.write(image.blob)
-                        #     content += ('[' + image_filename + ']')
+                        content += shape.text + "\n"
             return content
         except Exception:
             raise
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
+        # —— Lifecycle: start processing the PPTX —— #
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type=LifeType.DATA_PROCESSING,
+        )
+        logger.debug("⚙️ DATA_PROCESSING lifecycle generated")
+
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
+            extension = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
             mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+
+            # —— Lifecycle: processing finished —— #
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+            logger.debug("⚙️ DATA_PROCESSED lifecycle generated")
+
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
-        except Exception:
-            raise
+
+        except Exception as e:
+            # —— Lifecycle: processing failed —— #
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            logger.debug("⚙️ DATA_PROCESS_FAILED lifecycle generated")
+
+            raise Exception(
+                {
+                    "error": str(e),
+                    "file_path": file_path,
+                    "lifecycle": [lc_fail.to_dict()],
+                }
+            )
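
Reviewer note: every parser in this release replaces the old life_type="LLM_ORIGIN" string with the LifeType enum from the new datamax/utils/lifecycle_types.py and brackets parse() with start, finished, and failed events. A minimal usage sketch under the assumptions visible in the diff (module path, enum members); the "lifecycle" key on the returned dict is inferred from add_lifecycle() rather than confirmed:

# Hedged sketch: observing PptxParser's new lifecycle events.
# Assumes pydatamax 0.1.16.post2 is installed; "slides.pptx" is a stand-in path.
from datamax.parser.pptx_parser import PptxParser

parser = PptxParser(file_path="slides.pptx")
result = parser.parse("slides.pptx")  # parse() returns output_vo.to_dict()

# Success should carry DATA_PROCESSING and DATA_PROCESSED entries; on failure
# the raised exception's payload carries a DATA_PROCESS_FAILED entry instead.
for event in result.get("lifecycle", []):  # key name assumed, not confirmed
    print(event)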
datamax/parser/txt_parser.py CHANGED
@@ -1,8 +1,10 @@
-import chardet
 from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-import os
+
+import chardet
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
+
 
 class TxtParser(BaseLife):
     def __init__(self, file_path: Union[str, list]):
@@ -12,9 +14,9 @@ class TxtParser(BaseLife):
     @staticmethod
     def detect_encoding(file_path: str):
        try:
-            with open(file_path, 'rb') as f:
+            with open(file_path, "rb") as f:
                 result = chardet.detect(f.read())
-                return result['encoding']
+                return result["encoding"]
         except Exception as e:
             raise e
 
@@ -27,20 +29,49 @@ class TxtParser(BaseLife):
         """
         try:
             encoding = TxtParser.detect_encoding(file_path)
-            with open(file_path, 'r', encoding=encoding) as file:
+            with open(file_path, "r", encoding=encoding) as file:
                 return file.read()
         except Exception as e:
             raise e
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
-            content = self.read_txt_file(file_path=file_path)  # the real data is loaded via load
+            extension = self.get_file_extension(file_path)
+
+            # 1) Start processing
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSING,
+            )
+
+            # 2) Read the file content
+            content = self.read_txt_file(file_path=file_path)
             mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+
+            # 3) Build the output VO and attach the start lifecycle
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+
+            # 4) Processing finished
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+            output_vo.add_lifecycle(lc_end)
+
             return output_vo.to_dict()
+
         except Exception as e:
-            raise e
+            # 5) Processing failed
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            # (Optional) to also return a VO on failure, build one here with empty content and attach lc_fail
+            raise
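
TxtParser keeps its chardet-based encoding sniffing. The same detect-then-decode pattern in isolation, as a standalone sketch (the file name is a placeholder):

# Standalone sketch of the detect-then-read pattern TxtParser uses.
import chardet

def read_text_any_encoding(path: str) -> str:
    with open(path, "rb") as f:
        raw = f.read()
    # chardet.detect returns {"encoding": ..., "confidence": ...}; fall back if undetected
    encoding = chardet.detect(raw)["encoding"] or "utf-8"
    return raw.decode(encoding, errors="replace")

print(read_text_any_encoding("notes.txt")[:200])  # "notes.txt" is hypothetical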
datamax/parser/xls_parser.py CHANGED
@@ -1,8 +1,10 @@
-from datamax.parser.base import MarkdownOutputVo
-from datamax.parser.base import BaseLife
-import pandas as pd
 import warnings
 
+import pandas as pd
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
+
 warnings.filterwarnings("ignore")
 
 
@@ -15,12 +17,38 @@ class XlsParser(BaseLife):
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
+            # 🏷️ Parsing started
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSING,
+            )
+
+            # 📊 Read the Excel file and generate Markdown
             df = pd.read_excel(file_path)
             mk_content = df.to_markdown(index=False)
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+
+            # 🏷️ Parsing finished
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+
             output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
+
         except Exception as e:
+            # ❌ Parsing failed
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            # Do not return an empty VO here; raise directly so the framework can catch and report it
             raise e
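
The conversion itself is a thin wrapper over pandas. Note that DataFrame.to_markdown() delegates to the optional tabulate package, so that dependency must be present. A toy sketch, with a made-up frame standing in for pd.read_excel(file_path):

# Sketch of the Excel-to-Markdown step XlsParser relies on (requires tabulate).
import pandas as pd

df = pd.DataFrame({"name": ["a", "b"], "qty": [1, 2]})  # stand-in for pd.read_excel(...)
print(df.to_markdown(index=False))  # emits a Markdown pipe table: | name | qty | ...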
datamax/parser/xlsx_parser.py CHANGED
@@ -1,4 +1,3 @@
-from loguru import logger
 import multiprocessing
 import os
 import time
@@ -6,8 +5,10 @@ import warnings
 from multiprocessing import Queue
 
 import pandas as pd
+from loguru import logger
 
 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 
 warnings.filterwarnings("ignore")
 
@@ -15,11 +16,10 @@ warnings.filterwarnings("ignore")
 class XlsxParser(BaseLife):
     """XLSX parser - reads Excel with pandas and converts to markdown, supports multiprocessing"""
 
-    def __init__(self, file_path, timeout):
+    def __init__(self, file_path):
         super().__init__()
         self.file_path = file_path
-        self.timeout = timeout
-        logger.info(f"🚀 XlsxParser initialized - file path: {file_path}, timeout: {timeout}s")
+        logger.info(f"🚀 XlsxParser initialized - file path: {file_path}")
 
     def _parse_with_pandas(self, file_path: str) -> str:
         """Read the Excel file with pandas and convert it to markdown"""
@@ -85,7 +85,9 @@ class XlsxParser(BaseLife):
             markdown_content = "*worksheet is empty*"
             logger.warning("⚠️ Worksheet is empty")
 
-        logger.info(f"🎊 pandas conversion finished, markdown length: {len(markdown_content)} characters")
+        logger.info(
+            f"🎊 pandas conversion finished, markdown length: {len(markdown_content)} characters"
+        )
         logger.debug(f"👀 Preview of first 200 characters: {markdown_content[:200]}...")
 
         return markdown_content
@@ -107,6 +109,15 @@
         """Core method for parsing the Excel file"""
         logger.info(f"🎬 Start parsing Excel file: {file_path}")
 
+        # —— Lifecycle: processing started —— #
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type=LifeType.DATA_PROCESSING,
+        )
+        logger.debug("⚙️ DATA_PROCESSING lifecycle generated")
+
         try:
             # Parse the Excel file with pandas
             logger.info("🐼 Parsing Excel in pandas mode")
@@ -119,19 +130,20 @@
 
            logger.info(f"🎊 File content parsed, final length: {len(mk_content)} characters")
 
-            # Generate the lifecycle info
-            lifecycle = self.generate_lifecycle(
+            # —— Lifecycle: processing finished —— #
+            lc_end = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
                 usage_purpose="Documentation",
-                life_type="LLM_ORIGIN",
+                life_type=LifeType.DATA_PROCESSED,
             )
-            logger.debug("⚙️ Lifecycle info generated")
+            logger.debug("⚙️ DATA_PROCESSED lifecycle generated")
 
-            # Create the output object
-            title = os.path.splitext(os.path.basename(file_path))[0]
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            # Create the output object and attach both lifecycle events
+            extension = self.get_file_extension(file_path)
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
 
             result = output_vo.to_dict()
             result_queue.put(result)
@@ -142,15 +154,46 @@
             return result
 
         except Exception as e:
+            # —— Lifecycle: processing failed —— #
+            try:
+                lc_fail = self.generate_lifecycle(
+                    source_file=file_path,
+                    domain="Technology",
+                    usage_purpose="Documentation",
+                    life_type=LifeType.DATA_PROCESS_FAILED,
+                )
+                logger.debug("⚙️ DATA_PROCESS_FAILED lifecycle generated")
+                # If needed, it can also be attached to error_result:
+                # error_result = {"error": str(e), "file_path": file_path, "lifecycle": [lc_fail.to_dict()]}
+            except Exception:
+                pass
+
+            # —— Lifecycle: processing failed —— #
+            try:
+                lc_fail = self.generate_lifecycle(
+                    source_file=file_path,
+                    domain="Technology",
+                    usage_purpose="Documentation",
+                    life_type=LifeType.DATA_PROCESS_FAILED,
+                )
+                logger.debug("⚙️ DATA_PROCESS_FAILED lifecycle generated")
+            except Exception:
+                pass
+
             logger.error(f"💀 Failed to parse Excel file: {file_path}, error: {str(e)}")
             # Put the error into the queue as well
-            error_result = {"error": str(e), "file_path": file_path}
+            error_result = {
+                "error": str(e),
+                "file_path": file_path,
+                # Also return the failure lifecycle; tests may optionally verify it
+                "lifecycle": [lc_fail.to_dict()] if "lc_fail" in locals() else [],
+            }
             result_queue.put(error_result)
             raise
 
     def parse(self, file_path: str) -> dict:
         """Parse the Excel file - supports multiprocessing and timeout control"""
-        logger.info(f"🚀 Launching Excel parsing process - file: {file_path}, timeout: {self.timeout}s")
+        logger.info(f"🚀 Launching Excel parsing process - file: {file_path}")
 
         try:
             # Verify the file exists
@@ -169,42 +212,6 @@
             process.start()
             logger.debug(f"⚡ Child process started, PID: {process.pid}")
 
-            start_time = time.time()
-
-            # Wait for parsing to finish or time out
-            while time.time() - start_time < self.timeout:
-                elapsed_time = int(time.time() - start_time)
-                logger.debug(f"⏱️ Waiting for parsing to finish... {elapsed_time}s")
-
-                if not process.is_alive():
-                    logger.debug("✅ Child process finished")
-                    break
-
-                if not result_queue.empty():
-                    result = result_queue.get()
-                    process.join()  # wait for the process to exit normally
-
-                    # Check whether the result is an error
-                    if "error" in result:
-                        logger.error(f"💥 Child process returned an error: {result['error']}")
-                        raise Exception(result["error"])
-
-                    logger.info(f"🎉 Excel parsed successfully in {elapsed_time}s")
-                    return result
-
-                time.sleep(1)
-            else:
-                # Timeout handling
-                logger.error(f"⏰ Parsing timed out ({self.timeout}s), terminating the process")
-                process.terminate()
-                process.join(timeout=5)  # give the process 5 seconds to exit gracefully
-
-                if process.is_alive():
-                    logger.error("💀 Force-killing the process")
-                    process.kill()
-
-                raise TimeoutError(f"Excel parsing timed out: {file_path}")
-
         except Exception as e:
             logger.error(
                 f"💀 Excel parsing failed: {file_path}, error type: {type(e).__name__}, error message: {str(e)}"
datamax/utils/__init__.py CHANGED
@@ -5,7 +5,7 @@ from datamax.utils.data_cleaner import (
 )
 from datamax.utils.env_setup import setup_environment
 
-# 条件导入UNO处理器
+# Conditionally import the UNO processor
 try:
     from datamax.utils.uno_handler import (
         HAS_UNO,
@@ -23,6 +23,7 @@ except ImportError:
     cleanup_uno_manager = None
     uno_manager_context = None
 
+
 def clean_original_text(text):
     """
     Clean the original text.
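
The try/except ImportError guard keeps datamax importable when the LibreOffice UNO bindings are missing, with HAS_UNO as the feature flag. The pattern reduced to its core (sketch only):

# Optional-dependency import, as in datamax/utils/__init__.py above.
try:
    from datamax.utils.uno_handler import HAS_UNO
except ImportError:
    HAS_UNO = False  # UNO not installed; UNO-backed features stay disabled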
datamax/utils/data_cleaner.py CHANGED
@@ -31,15 +31,15 @@ class AbnormalCleaner:
         """
         Extract reference entries and assign to self.parsed_data
         (Original text will be replaced with extracted references, each item on a separate line)
-        
+
         Returns:
             str: Extracted reference text (same as self.parsed_data)
         """
         patterns = [
-            r'([A-Z][a-z]+(?:, [A-Z](?:\.[a-z]*)?)+(?: et al\.)? \(\d{4}\)[^\n]+)',  # APA format
-            r'(\[\d+\][^\n]+)',  # Numbered references like [1]
-            r'(DOI:\s?\S+|https?://\S+)',  # DOI/URL
-            r'([A-Z][a-z]+, [A-Z]\.?,? & [A-Z][a-z]+, [A-Z]\. \d{4}[^\n]+)'  # Multi-author APA
+            r"([A-Z][a-z]+(?:, [A-Z](?:\.[a-z]*)?)+(?: et al\.)? \(\d{4}\)[^\n]+)",  # APA format
+            r"(\[\d+\][^\n]+)",  # Numbered references like [1]
+            r"(DOI:\s?\S+|https?://\S+)",  # DOI/URL
+            r"([A-Z][a-z]+, [A-Z]\.?,? & [A-Z][a-z]+, [A-Z]\. \d{4}[^\n]+)",  # Multi-author APA
         ]
         references = []
         for pattern in patterns:
@@ -47,9 +47,11 @@ class AbnormalCleaner:
                 references.extend(re.findall(pattern, self.parsed_data))
             except re.error as e:
                 print(f"Regex error {pattern}: {e}")
-        
+
         # Assign extraction results to parsed_data (each item on a separate line)
-        self.parsed_data = "\n".join(list(set(references)))  # Deduplicate and merge into string
+        self.parsed_data = "\n".join(
+            list(set(references))
+        )  # Deduplicate and merge into string
         return self.parsed_data
 
     # Exception cleaning class
@@ -164,19 +166,19 @@
         """Filter by word repetition rate"""
         if not isinstance(self.parsed_data, str):
             return False
-        
+
         text = str(self.parsed_data)
-        bi_grams = [text[i:i+2] for i in range(0, len(text)-1, 2)]
+        bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
         word_count = len(bi_grams)
         if word_count == 0:
             print("No words found.")
             return False
-        
+
         word_freq = Counter(bi_grams)
         most_common_word, most_common_count = word_freq.most_common(1)[0]
         repetition_rate = most_common_count / word_count
         print(f"Word repetition rate: {repetition_rate}")
-        
+
         return repetition_rate <= threshold
 
     def filter_by_char_count(self, min_chars=30, max_chars=500000):
@@ -227,22 +229,34 @@ class PrivacyDesensitization:
         # Customer service hotlines are not easy to match and are not considered private data
         self.parsed_data = re.sub(r"\d+-\d+-\d+", token, self.parsed_data)
         return self.parsed_data
-    
+
     def replace_bank_id(self, token="COSCO_NUMBER"):
         # Match bank card numbers and replace
-        self.parsed_data = self.replace_bank_id(
-            self.parsed_data, token=token
-        )
+        BANK_ID_PATTERN = r"\b(?:(?:\d{4}[ -]?){4}\d{3}|(?:\d{4}[ -]?){3}\d{4}|(?:4\d{3}|5[1-5]\d{2}|6[045]\d{2})(?:[ -]?\d{4}){3}|3[47]\d{2}[ -]?\d{6}[ -]?\d{5})\b"
+
+        def luhn_check(card_number):
+            digits = [int(d) for d in card_number if d.isdigit()]
+            if len(digits) not in (13, 15, 16, 19):
+                return False
+            checksum = sum(digits[-1::-2])
+            checksum += sum(sum(divmod(d * 2, 10)) for d in digits[-2::-2])
+            return checksum % 10 == 0
+
+        bank_card_numbers = re.findall(BANK_ID_PATTERN, self.parsed_data)
+
+        for card_number in bank_card_numbers:
+            if luhn_check(card_number):
+                self.parsed_data = re.sub(card_number, token, self.parsed_data)
         return self.parsed_data
-    
+
     def replace_phone_number(self, token="COSCO_NUMBER"):
         # Match phone numbers and replace
         self.parsed_data = jio.replace_phone_number(self.parsed_data, token)
         return self.parsed_data
-    
+
     def replace_qq(self, token="COSCO_NUMBER"):
         # Match QQ numbers and replace
-        self.parsed_data = jio.replace_qq(self.parsed_data,token)
+        self.parsed_data = jio.replace_qq(self.parsed_data, token)
         return self.parsed_data
 
     def replace_id_card(self, token="COSCO_NUMBER"):
@@ -252,6 +266,10 @@
 
     def replace_number(self):
         # Replace all types of numeric private data
+        # Bank card
+        self.parsed_data = self.replace_bank_id(
+            token="BANK_ID"
+        )  # nosec B106 - this is a data-masking token, not a password
 
         # Landline + mobile phone
         self.parsed_data = jio.replace_phone_number(self.parsed_data, "COSCO_NUMBER")
@@ -259,10 +277,6 @@
         self.parsed_data = jio.replace_qq(self.parsed_data, "COSCO_NUMBER")
         # ID card
         self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
-        # Bank card
-        self.parsed_data = self.replace_bank_id(
-            self.parsed_data, token="COSCO_NUMBER"
-        )  # nosec B106 - this is a data-masking token, not a password
 
         return self.parsed_data
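
The rewritten replace_bank_id fixes the old unconditional self-recursion and now only masks candidates that pass the Luhn checksum. The check from the diff, run on a well-known test number (4111 1111 1111 1111 validates; flipping the last digit breaks it):

# Luhn checksum exactly as in the diff above, with a worked example.
def luhn_check(card_number: str) -> bool:
    digits = [int(d) for d in card_number if d.isdigit()]
    if len(digits) not in (13, 15, 16, 19):
        return False
    checksum = sum(digits[-1::-2])  # digits in odd positions from the right
    checksum += sum(sum(divmod(d * 2, 10)) for d in digits[-2::-2])  # doubled even positions
    return checksum % 10 == 0

assert luhn_check("4111 1111 1111 1111")      # valid Visa test number
assert not luhn_check("4111 1111 1111 1112")  # one changed digit fails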
datamax/utils/env_setup.py CHANGED
@@ -1,11 +1,12 @@
+import importlib.metadata
+import os
 import subprocess
 import sys
-import os
-import importlib.metadata
+
 
 class EnvironmentSetup:
-    """ Responsible for setting up the correct environment,
-        including checking GPU support and installing the necessary packages
+    """Responsible for setting up the correct environment,
+    including checking GPU support and installing the necessary packages
     """
 
     def __init__(self, use_gpu: bool = False):
@@ -18,36 +19,40 @@ class EnvironmentSetup:
         if self._gpu_available is None:
             try:
                 # Check whether CUDA is available
-                subprocess.check_output(['nvcc', '--version'], stderr=subprocess.STDOUT)
+                subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT)
                 self._gpu_available = True
             except (subprocess.CalledProcessError, FileNotFoundError):
                 self._gpu_available = False
         return self._gpu_available
 
     def is_conda(self):
-        """ Check whether the current environment is a Conda environment """
-        return os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+        """Check whether the current environment is a Conda environment"""
+        return os.path.exists(os.path.join(sys.prefix, "conda-meta"))
 
     def install_package(self, package_name):
-        """ Select pip or conda or other installation specified package according to the environment """
-        installer = 'conda' if self.is_conda() else 'pip'
-        if installer == 'conda':
+        """Select pip or conda or other installation specified package according to the environment"""
+        installer = "conda" if self.is_conda() else "pip"
+        if installer == "conda":
             print(f"Detected Conda environment. Installing {package_name} with conda.")
             try:
-                subprocess.check_call(['pip', 'install', package_name])
+                subprocess.check_call(["pip", "install", package_name])
                 print(f"Successfully installed {package_name} with conda.")
             except subprocess.CalledProcessError as e:
                 print(f"Failed to install {package_name} with conda: {e}")
-        elif installer == 'pip':
+        elif installer == "pip":
             print(f"Using pip to install {package_name}.")
             try:
                 # Invoke the pip installation package using the Python interpreter
-                subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                subprocess.check_call(
+                    [sys.executable, "-m", "pip", "install", package_name]
+                )
                 print(f"Successfully installed {package_name} with pip.")
             except subprocess.CalledProcessError as e:
                 print(f"Failed to install {package_name} with pip: {e}")
         else:
-            print("Unable to determine the package manager. Please install the package manually.")
+            print(
+                "Unable to determine the package manager. Please install the package manually."
+            )
 
     def check_and_install(self):
         """Check and install appropriate packages based on user's choice and GPU availability"""
@@ -56,12 +61,14 @@ class EnvironmentSetup:
 
         # Override GPU detection with the use_gpu parameter
         if self.use_gpu:
-            pkg_name = 'paddlepaddle-gpu' if self.is_gpu_available() else 'paddlepaddle'
+            pkg_name = "paddlepaddle-gpu" if self.is_gpu_available() else "paddlepaddle"
         else:
-            pkg_name = 'paddlepaddle'
+            pkg_name = "paddlepaddle"
 
         try:
-            _ = importlib.metadata.version(pkg_name.split()[0])  # Check if paddlepaddle is installed
+            _ = importlib.metadata.version(
+                pkg_name.split()[0]
+            )  # Check if paddlepaddle is installed
             # print(f"{pkg_name} version {1} is already installed.")
         except importlib.metadata.PackageNotFoundError:
             print(f"{pkg_name} is not installed. Installing now...")
@@ -77,4 +84,4 @@ env_setup = EnvironmentSetup()  # Set this flag as needed
 def setup_environment(use_gpu: bool = False):
     """Used to set the environment when the program starts"""
     env_setup.use_gpu = use_gpu
-    env_setup.check_and_install()
+    env_setup.check_and_install()
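
setup_environment() is the public entry point, re-exported through datamax/utils/__init__.py (see the hunk above). A usage sketch; the behavior described follows check_and_install():

# Installs paddlepaddle-gpu only when use_gpu=True and nvcc is detected;
# otherwise falls back to the CPU build of paddlepaddle.
from datamax.utils import setup_environment

setup_environment(use_gpu=True)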