re-common 10.0.24-py3-none-any.whl → 10.0.26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,7 @@ def get_first_author(author: str) -> str:
75
75
  return au
76
76
  return ""
77
77
 
78
+
78
79
  def get_author_list(author: str):
79
80
  lists = []
80
81
  if not author:
@@ -0,0 +1,53 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Generator
3
+
4
+
5
+ class BaseFileReader(ABC):
6
+
7
+ def __init__(self, batch_size: int = 10000):
8
+ self.batch_size = batch_size
9
+ self.read_model = 1
10
+
11
+ @abstractmethod
12
+ def list_files(self, path: str) -> List[str]:
13
+ """列出路径下所有目标文件"""
14
+ pass
15
+
16
+ @abstractmethod
17
+ def count_lines(self, file_path: str) -> int:
18
+ """统计文件行数"""
19
+ pass
20
+
21
+ @abstractmethod
22
+ def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
23
+ """读取文件内容,返回批量数据"""
24
+ pass
25
+
26
+ @abstractmethod
27
+ def read_all(self, file_path: str) -> List[List[str]]:
28
+ """读取整个文件,默认按1000行分批"""
29
+ return [line for line in self.read_lines(file_path)]
30
+
31
+ def read_select(self, file_path: str) -> Generator[List[str], None, None]:
32
+ if self.read_model == 1:
33
+ for batch_data in self.read_lines(file_path):
34
+ yield batch_data
35
+ elif self.read_model == 2:
36
+ for batch_data in self.read_all(file_path):
37
+ yield batch_data
38
+ else:
39
+ raise Exception("模式选择错误")
40
+
41
+
42
+ class BaseFileWriter(ABC):
43
+
44
+ def __init__(self, file_path: str, compress: bool = True, overwrite: bool = True, encoding: str = "utf-8"):
45
+ self.file_path = file_path
46
+ self.compress = compress
47
+ self.encoding = encoding
48
+ self.overwrite = overwrite
49
+
50
+ @abstractmethod
51
+ def write_lines(self, lines: List[str], file_path: str):
52
+ """写入多行文本到文件,支持压缩"""
53
+ pass
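For orientation, here is a minimal sketch of how a concrete reader might plug into this interface. The TxtFileReader class and the sample path are hypothetical and not part of the package; only the BaseFileReader import path is taken from this release.

# Hypothetical example: a plain-text reader built on the abstract base above.
from pathlib import Path
from typing import List, Generator

from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader


class TxtFileReader(BaseFileReader):
    def list_files(self, path: str) -> List[str]:
        return [str(p) for p in Path(path).rglob("*.txt")]

    def count_lines(self, file_path: str) -> int:
        with open(file_path, encoding="utf-8") as f:
            return sum(1 for _ in f)

    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
        batch = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    batch.append(line.strip())
                if len(batch) >= self.batch_size:
                    yield batch
                    batch = []
        if batch:
            yield batch

    def read_all(self, file_path: str) -> List[List[str]]:
        return list(self.read_lines(file_path))


reader = TxtFileReader(batch_size=1000)
for batch in reader.read_select("data/sample.txt"):  # read_model == 1 streams batches
    print(len(batch))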
@@ -0,0 +1,508 @@
1
+ import asyncio
2
+ import os
3
+ import re
4
+ import sqlite3
5
+ import time
6
+ import traceback
7
+ from concurrent.futures import ProcessPoolExecutor
8
+ from pathlib import Path
9
+ from typing import List, Callable, Any
10
+
11
+ from filelock import FileLock
12
+ from tenacity import retry, stop_after_attempt, wait_exponential, wait_random, retry_if_result
13
+
14
+ from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader, BaseFileWriter
15
+
16
+
17
+ class DatabaseHandler:
18
+ def __init__(self, db_file="processed_files.db"):
19
+ self.db_file = db_file
20
+ self.lock_file = f"{self.db_file}.lock"
21
+ self._init_db()
22
+
23
+ def _init_db(self):
24
+ with FileLock(self.lock_file):
25
+ """初始化 SQLite 数据库"""
26
+ with sqlite3.connect(self.db_file) as conn:
27
+ cursor = conn.cursor()
28
+ cursor.execute("""
29
+ CREATE TABLE IF NOT EXISTS processed_files (
30
+ file_path TEXT PRIMARY KEY
31
+ )
32
+ """)
33
+ conn.commit()
34
+
35
+ def save_processed_file(self, file_path):
36
+ """保存处理过的文件"""
37
+ with FileLock(self.lock_file):
38
+ with sqlite3.connect(self.db_file) as conn:
39
+ cursor = conn.cursor()
40
+ cursor.execute(
41
+ "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
42
+ (file_path,)
43
+ )
44
+ conn.commit()
45
+
46
+ def save_processed_files_many(self, file_paths):
47
+ """批量保存处理过的文件路径"""
48
+ if not file_paths:
49
+ return
50
+ with FileLock(self.lock_file):
51
+ with sqlite3.connect(self.db_file) as conn:
52
+ cursor = conn.cursor()
53
+ cursor.executemany(
54
+ "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
55
+ ((fp,) for fp in file_paths)
56
+ )
57
+ conn.commit()
58
+
59
+ def is_file_processed(self, file_path):
60
+ """检查文件是否已处理"""
61
+ with FileLock(self.lock_file):
62
+ with sqlite3.connect(self.db_file) as conn:
63
+ cursor = conn.cursor()
64
+ cursor.execute(
65
+ "SELECT file_path FROM processed_files WHERE file_path = ?",
66
+ (file_path,)
67
+ )
68
+ result = cursor.fetchone()
69
+ return result is not None
70
+
71
+ def fake_processed_files(self, start_index, end_index, file_list):
72
+ try:
73
+ # Convert the string indices to integers
74
+ start = int(start_index)
75
+ end = int(end_index)
76
+
77
+ # Validate the index range
78
+ if start >= end:
79
+ raise ValueError(f"起始序号 {start_index} 必须小于结束序号 {end_index}")
80
+
81
+ list_formatted_num = []
82
+ # Generate a file-number token for every index in the range
83
+ for num in range(start, end):
84
+ # Format the index as a 5-digit string (00120, 00121, ...)
85
+ formatted_num = f"{num:05d}"
86
+ list_formatted_num.append(formatted_num)
87
+
88
+ skip_path_list = []
89
+ skip_formatted_num = []
90
+ for file_path in file_list:
91
+ re_f_num = re.findall(r'(?<!\d)\d{5}(?!\d)', str(Path(file_path).stem))
92
+ if re_f_num:
93
+ if re_f_num[0] in list_formatted_num:
94
+ skip_path_list.append(file_path)
95
+ skip_formatted_num.append(re_f_num[0])
96
+
97
+ for item_list in [skip_path_list[i:i + 2000] for i in range(0, len(skip_path_list), 2000)]:
98
+ self.save_processed_files_many(item_list)
99
+ for file_path in item_list:
100
+ print(f"伪造处理记录: {file_path}")
101
+
102
+ no_fil_num = set(list_formatted_num) - set(skip_formatted_num)
103
+ if len(no_fil_num) > 0:
104
+ print(f"没有对应num的文件,伪造失败数量为{len(no_fil_num)},样例:{list(no_fil_num)[:10]}")
105
+ print(f"成功伪造处理记录:序号 {start_index} 到 {end_index}(不含)的文件")
106
+
107
+ except ValueError as e:
108
+ print(f"错误: 序号格式无效 - {str(e)}")
109
+ except Exception as e:
110
+ print(f"伪造处理记录时出错: {str(e)}")
111
+
112
+
113
+ class DataProcessor:
114
+ def __init__(
115
+ self,
116
+ reader: BaseFileReader,
117
+ writer: BaseFileWriter = None,
118
+ db_handler: DatabaseHandler = None,
119
+ db_file="processed_files.db",
120
+ batch_size=50,
121
+ retry_limit=3,
122
+ ):
123
+ self.reader = reader
124
+ self.writer = writer
125
+ self.db_file = db_file
126
+ self.batch_size = batch_size
127
+ self.retry_limit = retry_limit
128
+ self.db_handler = db_handler if db_handler else DatabaseHandler(db_file=db_file)
129
+
130
+ async def retry_process_data(self, data, process_func):
131
+ """处理数据并执行处理函数"""
132
+
133
+ def on_retry(retry_state):
134
+ # Called on every failed attempt to print a message
135
+ print(
136
+ f"重试次数: {retry_state.attempt_number}/{self.retry_limit},数据内容: {retry_state.args[0]}\n"
137
+ f"异常信息: {retry_state.outcome.exception()}"
138
+ )
139
+
140
+ def on_retry_error(retry_state):
141
+ # Called after the final attempt fails
142
+ original_exc = retry_state.outcome.exception()
143
+ raise RuntimeError(
144
+ f"处理数据失败,达到重试上限。data: {retry_state.args[0]}") from original_exc # 抛出的自定义异常中 保留 __process_func() 里的原始错误堆栈信息(traceback)
145
+
146
+ @retry(stop=stop_after_attempt(3),
147
+ wait=wait_exponential(multiplier=1, min=2, max=20),
148
+ before_sleep=on_retry,  # invoked after each failed attempt
149
+ retry_error_callback=on_retry_error,  # raise if all attempts fail
150
+ reraise=True)
151
+ async def __process_func(_data):
152
+ return await process_func(_data)
153
+
154
+ return await __process_func(data)
155
+
156
+ async def process_file(self, hdfs_file_path, process_func, write_dir):
157
+ """处理单个 gz 文件"""
158
+ total_lines = self.reader.count_lines(hdfs_file_path)
159
+ processed_lines = 0
160
+ start_time = time.time()
161
+ results = []
162
+ # Select the file-reading method according to the configured read mode
163
+ for lines in self.reader.read_select(hdfs_file_path):
164
+ processing_start_time = time.time()  # record when this batch started
165
+
166
+ tasks = []
167
+ for line in lines:
168
+ # try:
169
+ # data = json.loads(line)
170
+ # tasks.append(self.retry_process_data(data, process_func))
171
+ # except json.JSONDecodeError as e:
172
+ # raise Exception(f"解析JSON失败: {e}, 行内容: {line.strip()}")
173
+ tasks.append(self.retry_process_data(line, process_func))
174
+
175
+ # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool submits all tasks at once and limits concurrency
176
+ results.extend(await asyncio.gather(*tasks))
177
+
178
+ processed_lines += len(lines)
179
+
180
+ elapsed_time = time.time() - start_time  # elapsed time so far
181
+ processing_time = time.time() - processing_start_time  # time spent on this batch
182
+ avg_processing_time = (
183
+ (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
184
+ )  # average processing time per line (ms)
185
+
186
+ # Estimate the remaining time
187
+ remaining_time = (
188
+ ((avg_processing_time / 1000) * (total_lines - processed_lines))
189
+ if processed_lines > 0
190
+ else float("inf")
191
+ )
192
+
193
+ # Print overall progress
194
+ print(
195
+ f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
196
+ f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
197
+ f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
198
+ )
199
+
200
+ if write_dir is not None:
201
+ if not self.writer:
202
+ raise Exception("没有配置写数据的对象")
203
+ write_path = write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}"
204
+ self.writer.write_lines([str(item) for item in results], write_path)
205
+
206
+ # Final progress report
207
+ final_elapsed_time = time.time() - start_time  # total elapsed time
208
+ print(
209
+ f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
210
+ f"总已用时间: {final_elapsed_time:.2f}秒 | "
211
+ f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
212
+ if processed_lines > 0
213
+ else "处理无数据"
214
+ )
215
+
216
+ self.db_handler.save_processed_file(hdfs_file_path)  # record the file as processed
217
+
218
+ async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
219
+ """带重试机制的文件处理"""
220
+
221
+ def on_retry(retry_state):
222
+ # Called on every failed attempt to print a message
223
+ exc = retry_state.outcome.exception()
224
+ tb = ''.join(traceback.format_exception(type(exc), exc, exc.__traceback__))
225
+ print(tb)
226
+
227
+ print(
228
+ f"处理文件 {retry_state.args[0]} 时发生错误: {exc},正在重试 {retry_state.attempt_number}/{self.retry_limit}")
229
+
230
+ def on_retry_error(retry_state):
231
+ # Called after the final attempt fails
232
+ print(f"处理文件 {retry_state.args[0]} 失败,达到重试上限")
233
+ return False
234
+
235
+ @retry(stop=stop_after_attempt(3),
236
+ wait=wait_exponential(multiplier=1, min=2, max=20),
237
+ before_sleep=on_retry,  # invoked after each failed attempt
238
+ retry_error_callback=on_retry_error,  # return False if all attempts fail
239
+ reraise=True)
240
+ async def __process_func(_hdfs_file_path, _process_func, _write_dir):
241
+ await self.process_file(_hdfs_file_path, _process_func, _write_dir)
242
+ return True  # exit after successful processing
243
+
244
+ return await __process_func(hdfs_file_path, process_func, write_dir)
245
+
246
+ def get_file_list(self, hdfs_dir):
247
+ # List all task files
248
+ all_files = self.reader.list_files(hdfs_dir)
249
+ for file_path in all_files:
250
+ yield file_path
251
+
252
+ @retry(stop=stop_after_attempt(3),
253
+ wait=wait_random(min=10, max=30),
254
+ retry=retry_if_result(lambda result: not result),  # retry if the return value is False (failure); raises a default tenacity.RetryError if it never succeeds
255
+ reraise=True)
256
+ async def _batch_process_file(self, hdfs_file_path: str, process_func: Callable[[str], Any],
257
+ write_dir: str = None):
258
+ """批量更新所有 gz 文件"""
259
+ # all_succeed = True
260
+ # for hdfs_file_path in self.get_file_list(hdfs_dir):
261
+ # if self.db_handler.is_file_processed(hdfs_file_path):
262
+ # print(f"跳过已处理文件: {hdfs_file_path}")
263
+ # continue # 如果文件已处理,跳过
264
+ # succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir) # 处理文件
265
+ # if succeed is False:
266
+ # all_succeed = False
267
+ #
268
+ # if all_succeed:
269
+ # # 处理完成后删除数据库文件
270
+ # try:
271
+ # if os.path.exists(self.db_file):
272
+ # os.remove(self.db_file)
273
+ # print(f"已删除断点重试文件: {self.db_file}")
274
+ # return True
275
+ # except Exception as e:
276
+ # print(f"删除断点重试文件失败: {e}")
277
+ # return False
278
+ if self.db_handler.is_file_processed(hdfs_file_path):
279
+ print(f"跳过已处理文件: {hdfs_file_path}")
280
+ return True  # skip files that were already processed
281
+ succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir)  # process the file
282
+ return succeed
283
+
284
+ async def process_file_bulk(self, hdfs_file_path, process_func, write_dir):
285
+ """按批次处理单个文件,批量数据传递给处理函数"""
286
+ # Get the total number of lines in the file
287
+ total_lines = self.reader.count_lines(hdfs_file_path)
288
+ processed_lines = 0
289
+ start_time = time.time()
290
+
291
+ results = []
292
+ tasks = []
293
+ # Select the file-reading method according to the configured read mode
294
+ for lines in self.reader.read_select(hdfs_file_path):
295
+ processing_start_time = time.time()  # record when this batch started
296
+
297
+ # batch_data = []
298
+ # for line in lines:
299
+ # try:
300
+ # data = json.loads(line)
301
+ # batch_data.append(data)
302
+ # except json.JSONDecodeError as e:
303
+ # raise Exception(f"解析JSON失败: {e}, 行内容: {line.strip()}")
304
+
305
+ # Handle the batch that was just read
306
+ if lines:
307
+ tasks.append(process_func(lines))  # pass the batch to the processing function and collect the task
308
+ processed_lines += len(lines)  # update the number of processed lines
309
+
310
+ # Once the number of queued tasks reaches batch_size, run them all concurrently
311
+ if len(tasks) >= self.batch_size:
312
+ results.extend(await asyncio.gather(*tasks))
313
+ elapsed_time = time.time() - start_time  # elapsed time so far
314
+ processing_time = time.time() - processing_start_time  # time spent on this batch
315
+ avg_processing_time = (
316
+ (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
317
+ )  # average processing time per line (ms)
318
+
319
+ # Estimate the remaining time
320
+ remaining_time = (
321
+ ((avg_processing_time / 1000) * (total_lines - processed_lines))
322
+ if processed_lines > 0
323
+ else float("inf")
324
+ )
325
+
326
+ # Print overall progress
327
+ print(
328
+ f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
329
+ f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
330
+ f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
331
+ )
332
+
333
+ # Clear the task list, ready for the next batch
334
+ tasks.clear()
335
+ # Handle any remaining tasks
336
+ if tasks:
337
+ results.extend(await asyncio.gather(*tasks))  # run the leftover tasks that never reached batch_size
338
+
339
+ if write_dir is not None:
340
+ if not self.writer:
341
+ raise Exception("没有配置写数据的对象")
342
+ write_path = write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}"
343
+ self.writer.write_lines([str(item) for items in results for item in items], write_path)
344
+
345
+ # Final progress report
346
+ final_elapsed_time = time.time() - start_time  # total elapsed time
347
+ print(
348
+ f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
349
+ f"总已用时间: {final_elapsed_time:.2f}秒 | "
350
+ f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
351
+ if processed_lines > 0
352
+ else "处理无数据"
353
+ )
354
+
355
+ self.db_handler.save_processed_file(hdfs_file_path)
356
+
357
+ async def retry_process_file_bulk(self, hdfs_file_path, process_func, write_dir):
358
+ """带重试机制的批量文件处理"""
359
+
360
+ def on_retry(retry_state):
361
+ # 每次抛错进入该函数打印消息
362
+ exc = retry_state.outcome.exception()
363
+ tb = ''.join(traceback.format_exception(type(exc), exc, exc.__traceback__))
364
+ print(tb)
365
+ print(
366
+ f"处理文件 {retry_state.args[0]} 时发生错误: {exc},正在重试 {retry_state.attempt_number}/{self.retry_limit}")
367
+
368
+ def on_retry_error(retry_state):
369
+ # 最后抛错后调用
370
+ print(f"处理文件 {retry_state.args[0]} 失败,达到重试上限")
371
+ return False
372
+
373
+ @retry(stop=stop_after_attempt(3),
374
+ wait=wait_exponential(multiplier=1, min=2, max=20),
375
+ before_sleep=on_retry,  # invoked after each failed attempt
376
+ retry_error_callback=on_retry_error,  # return False if all attempts fail
377
+ reraise=True)
378
+ async def __process_func(_hdfs_file_path, _process_func, write_dir):
379
+ await self.process_file_bulk(_hdfs_file_path, _process_func, write_dir)
380
+ return True  # exit after successful processing
381
+
382
+ return await __process_func(hdfs_file_path, process_func, write_dir)
383
+
384
+ async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[List[str]], Any] | Callable[[str], Any],
385
+ write_dir: str = None, is_bulk: bool = False):
386
+ all_succeed = True
387
+ for hdfs_file_path in self.get_file_list(hdfs_dir):
388
+ if is_bulk:
389
+ succeed = await self._batch_process_file_bulk(hdfs_file_path, process_func, write_dir)
390
+ else:
391
+ succeed = await self._batch_process_file(hdfs_file_path, process_func, write_dir)
392
+ if succeed is False:
393
+ all_succeed = False
394
+ if all_succeed:
395
+ # Delete the checkpoint database once everything has been processed
396
+ try:
397
+ if os.path.exists(self.db_file):
398
+ os.remove(self.db_file)
399
+ print(f"已删除断点重试文件: {self.db_file}")
400
+ return True
401
+ except Exception as e:
402
+ print(f"删除断点重试文件失败: {e}")
403
+ return False
404
+
405
+ @retry(stop=stop_after_attempt(3),
406
+ wait=wait_random(min=10, max=30),
407
+ retry=retry_if_result(lambda result: not result),  # retry if the return value is False (failure); raises a default tenacity.RetryError if it never succeeds
408
+ reraise=True)
409
+ async def _batch_process_file_bulk(self, hdfs_file_path: str, process_func: Callable[[List[str]], Any],
410
+ write_dir: str = None):
411
+ """批量处理 gz 文件中的数据"""
412
+ # List all files
413
+ # all_succeed = True
414
+ # for hdfs_file_path in self.get_file_list(hdfs_dir):
415
+ # # 查看是否跳过文件
416
+ # if self.db_handler.is_file_processed(hdfs_file_path):
417
+ # print(f"跳过已处理文件: {hdfs_file_path}")
418
+ # continue # 跳过已处理文件
419
+ # # 开始批量处理文件
420
+ # succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func, write_dir)
421
+ # if succeed is False:
422
+ # all_succeed = False
423
+ #
424
+ # if all_succeed:
425
+ # # 处理完成后删除数据库文件
426
+ # try:
427
+ # if os.path.exists(self.db_file):
428
+ # os.remove(self.db_file)
429
+ # print(f"已删除断点重试文件: {self.db_file}")
430
+ # return True
431
+ # except Exception as e:
432
+ # print(f"删除断点重试文件失败: {e}")
433
+ # return False
434
+ # Check whether this file should be skipped
435
+ if self.db_handler.is_file_processed(hdfs_file_path):
436
+ print(f"跳过已处理文件: {hdfs_file_path}")
437
+ return True  # skip files that were already processed
438
+ # Start bulk-processing the file
439
+ succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func, write_dir)
440
+ return succeed
441
+
442
+
443
+ # Module-level globals; each worker process holds its own copies
444
+ _processor: DataProcessor | None = None
445
+ _process_func: Callable[[List[str]], Any] | Callable[[str], Any] | None = None
446
+ _process_args: dict
447
+
448
+
449
+ def get_data_processor_func(process_args):
450
+ _func_reader = process_args["reader_func"]
451
+ _reader_args = process_args["reader_kwargs"]
452
+ reader = _func_reader(**_reader_args)
453
+ writer = None
454
+ if process_args["is_writer"]:
455
+ _func_writer = process_args["writer_func"]
456
+ _writer_args = process_args["writer_kwargs"]
457
+ writer = _func_writer(**_writer_args)
458
+
459
+ data_kwargs = {
460
+ "reader": reader,
461
+ "writer": writer,
462
+ "db_file": process_args["db_file"]
463
+ }
464
+ if process_args.get("batch_size"):
465
+ data_kwargs["batch_size"] = process_args["batch_size"]
466
+ if process_args.get("retry_limit"):
467
+ data_kwargs["retry_limit"] = process_args["retry_limit"]
468
+
469
+ return DataProcessor(**data_kwargs)
470
+
471
+
472
+ def init_worker(process_func, process_args):
473
+ global _processor, _process_func, _process_args
474
+ _processor = get_data_processor_func(process_args)
475
+ _process_func = process_func
476
+ _process_args = process_args
477
+
478
+ _init_func = _process_args.get("init_work", None)
479
+ if _init_func:
480
+ _init_func()
481
+
482
+ _async_init_work = _process_args.get("async_init_work", None)
483
+ if _async_init_work:
484
+ asyncio.run(_async_init_work())
485
+
486
+
487
+ def worker(path_file):
488
+ if _process_args["is_bulk"]:
489
+ return asyncio.run(_processor._batch_process_file_bulk(path_file, _process_func, _process_args["write_dir"]))
490
+ else:
491
+ return asyncio.run(_processor._batch_process_file(path_file, _process_func, _process_args["write_dir"]))
492
+
493
+
494
+ def run_worker_many(hdfs_dir: str, process_func: Callable[[List[str]], Any] | Callable[[str], Any],
495
+ data_process_args: dict, max_workers=4):
496
+ processor = get_data_processor_func(data_process_args)
497
+ all_file = list(processor.get_file_list(hdfs_dir))
498
+ with ProcessPoolExecutor(
499
+ max_workers=max_workers,
500
+ initializer=init_worker,
501
+ initargs=(process_func, data_process_args)
502
+ ) as executor:
503
+ # Submit the tasks and wait for the results
504
+ results = executor.map(worker, all_file)
505
+ # Print the results
506
+ for result in results:
507
+ if result:
508
+ print(result)
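The module appears intended to be driven through run_worker_many; a hedged sketch follows. The directories and the parse coroutine are illustrative, while the dict keys mirror what get_data_processor_func(), init_worker() and worker() read from process_args.

# Illustrative only; assumes local gz input files and a local output directory.
from re_common.v2.baselibrary.tools.data_processer.data_processer import run_worker_many
from re_common.v2.baselibrary.tools.data_processer.data_reader import LocalGZFileReader
from re_common.v2.baselibrary.tools.data_processer.data_writer import LocalFileWriter


async def parse(line: str) -> str:
    # Per-line coroutine used when is_bulk=False; its return values are written out.
    return line.upper()


if __name__ == "__main__":
    process_args = {
        "reader_func": LocalGZFileReader,
        "reader_kwargs": {"batch_size": 1000},
        "is_writer": True,
        "writer_func": LocalFileWriter,
        "writer_kwargs": {"file_path": "./out/placeholder"},  # per-file paths are derived from write_dir
        "db_file": "processed_files.db",
        "batch_size": 50,
        "retry_limit": 3,
        "is_bulk": False,
        "write_dir": "./out",
    }
    run_worker_many("./in_gz", parse, process_args, max_workers=4)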
@@ -0,0 +1,187 @@
1
+ import gzip
2
+ import io
3
+ import json
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import List, Generator
7
+
8
+ import pandas as pd
9
+ from hdfs import InsecureClient
10
+
11
+ from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader
12
+
13
+
14
+ class HDFSFileReader(BaseFileReader):
15
+ def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
16
+ super().__init__(batch_size)
17
+ self.client = InsecureClient(hdfs_url, user=hdfs_user)
18
+
19
+ def list_files(self, path: str) -> List[str]:
20
+ return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0] != '_SUCCESS']
21
+
22
+ def count_lines(self, file_path: str) -> int:
23
+ with self.client.read(file_path) as f:
24
+ return sum(1 for _ in f)
25
+
26
+ def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
27
+ # Read in batches, then process; drawback: the connection may drop
28
+ with self.client.read(file_path) as f:
29
+ while True:
30
+ batch = []
31
+ for _ in range(self.batch_size):
32
+ try:
33
+ line = next(f)
34
+ line = line.decode('utf-8')
35
+ if line.strip():
36
+ batch.append(line.strip())
37
+ except StopIteration:
38
+ break
39
+ if not batch:
40
+ break
41
+ yield batch
42
+
43
+ def read_all(self, file_path: str) -> List[List[str]]:
44
+ # Read everything at once, then process in batches; drawback: memory usage
45
+ with self.client.read(file_path) as f:
46
+ lines = [line.decode('utf-8').strip() for line in f if line.decode('utf-8').strip()]
47
+ return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
48
+
49
+
50
+ class HDFSGZFileReader(BaseFileReader):
51
+ def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
52
+ super().__init__(batch_size)
53
+ self.hdfs_url = hdfs_url
54
+ self.hdfs_user = hdfs_user
55
+ self.client = None
56
+
57
+ def _init_client(self):
58
+ if self.client is None:
59
+ self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
60
+ return self
61
+
62
+ def list_files(self, path: str) -> List[str]:
63
+ self._init_client()
64
+ return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".gz")]
65
+
66
+ def count_lines(self, file_path: str) -> int:
67
+ self._init_client()
68
+ with self.client.read(file_path) as f:
69
+ with gzip.GzipFile(fileobj=f) as gz:
70
+ return sum(1 for _ in gz)
71
+
72
+ def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
73
+ self._init_client()
74
+ # Read in batches, then process; drawback: the connection may drop
75
+ with self.client.read(file_path) as f:
76
+ with gzip.GzipFile(fileobj=f) as gz:
77
+ while True:
78
+ batch = []
79
+ for _ in range(self.batch_size):
80
+ try:
81
+ line = next(gz)
82
+ if line.strip():
83
+ batch.append(line.decode("utf-8"))
84
+ except StopIteration:
85
+ break
86
+ if not batch:
87
+ break
88
+ yield batch
89
+
90
+ def read_all(self, file_path: str) -> List[List[str]]:
91
+ self._init_client()
92
+ # Read everything at once, then process in batches; drawback: memory usage
93
+ with self.client.read(file_path) as reader:
94
+ compressed_data = reader.read()
95
+ with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
96
+ content = gz_file.read().decode("utf-8")
97
+ lines = [i for i in content.split("\n") if i.strip()]
98
+ return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
99
+
100
+
101
+ class HDFSParquetFileReader(BaseFileReader):
102
+ def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
103
+ super().__init__(batch_size)
104
+ self.client = InsecureClient(hdfs_url, user=hdfs_user)
105
+
106
+ def list_files(self, path: str) -> List[str]:
107
+ return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".parquet")]
108
+
109
+ def count_lines(self, file_path: str) -> int:
110
+ with self.client.read(file_path) as f:
111
+ data = f.read()
112
+ df = pd.read_parquet(io.BytesIO(data))
113
+ count = len(df)
114
+ return count
115
+
116
+ def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
117
+ # Read in batches, then process; drawback: the connection may drop
118
+ with self.client.read(file_path) as f:
119
+ data = f.read()
120
+ df = pd.read_parquet(io.BytesIO(data))
121
+ records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
122
+ for i in range(0, len(records), self.batch_size):
123
+ yield records[i: i + self.batch_size]
124
+
125
+ def read_all(self, file_path: str) -> List[List[str]]:
126
+ # Read everything at once, then process in batches; drawback: memory usage
127
+ with self.client.read(file_path) as f:
128
+ data = f.read()
129
+ df = pd.read_parquet(io.BytesIO(data))
130
+ records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
131
+ return [records[i: i + self.batch_size] for i in range(0, len(records), self.batch_size)]
132
+
133
+
134
+ class LocalGZFileReader(BaseFileReader):
135
+ def list_files(self, path: str) -> List[str]:
136
+ return [str(p) for p in Path(path).rglob("*.gz")]
137
+
138
+ def count_lines(self, file_path: str) -> int:
139
+ with gzip.open(file_path, 'rt', encoding='utf-8') as f:
140
+ return sum(1 for _ in f)
141
+
142
+ def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
143
+ with gzip.open(file_path, 'rt', encoding='utf-8') as f:
144
+ while True:
145
+ batch = []
146
+ for _ in range(self.batch_size):
147
+ line = f.readline()
148
+ if not line:
149
+ break
150
+ if line.strip():
151
+ batch.append(line.strip())
152
+ if not batch:
153
+ break
154
+ yield batch
155
+
156
+ def read_all(self, file_path: str) -> List[List[str]]:
157
+ with gzip.open(file_path, 'rt', encoding='utf-8') as f:
158
+ lines = [line.strip() for line in f if line.strip()]
159
+ return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
160
+
161
+
162
+ class LocalFileReader(BaseFileReader):
163
+ def list_files(self, path: str) -> List[str]:
164
+ return [str(p) for p in Path(path).rglob("*") if p.is_file()]
165
+
166
+ def count_lines(self, file_path: str) -> int:
167
+ with open(file_path, 'r', encoding='utf-8') as f:
168
+ return sum(1 for _ in f)
169
+
170
+ def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
171
+ with open(file_path, 'r', encoding='utf-8') as f:
172
+ while True:
173
+ batch = []
174
+ for _ in range(self.batch_size):
175
+ line = f.readline()
176
+ if not line:
177
+ break
178
+ if line.strip():
179
+ batch.append(line.strip())
180
+ if not batch:
181
+ break
182
+ yield batch
183
+
184
+ def read_all(self, file_path: str) -> List[List[str]]:
185
+ with open(file_path, 'r', encoding='utf-8') as f:
186
+ lines = [line.strip() for line in f if line.strip()]
187
+ return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
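Unlike the line-oriented readers, HDFSParquetFileReader yields JSON-serialized rows; a small hypothetical sketch of consuming it (the HDFS URL and file path are placeholders):

import json

from re_common.v2.baselibrary.tools.data_processer.data_reader import HDFSParquetFileReader

reader = HDFSParquetFileReader(batch_size=500, hdfs_url="http://namenode:9870", hdfs_user="root")
for batch in reader.read_lines("/data/part-00000.parquet"):
    rows = [json.loads(s) for s in batch]  # each item is one row serialized as JSON
    print(len(rows))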
@@ -0,0 +1,38 @@
1
+ import gzip
2
+ from io import BytesIO
3
+ from typing import List
4
+
5
+ from hdfs import InsecureClient
6
+
7
+ from re_common.v2.baselibrary.tools.data_processer.base import BaseFileWriter
8
+
9
+
10
+ class HDFSFileWriter(BaseFileWriter):
11
+ def __init__(self, file_path: str, hdfs_url: str, hdfs_user: str, *args, **kwargs):
12
+ super().__init__(file_path, *args, **kwargs)
13
+ self.client = InsecureClient(hdfs_url, user=hdfs_user)
14
+
15
+ def write_lines(self, lines: List[str], file_path: str = None):
16
+ if file_path is None:
17
+ file_path = self.file_path
18
+ data = "\n".join(lines).encode(self.encoding)
19
+ if self.compress:
20
+ buf = BytesIO()
21
+ with gzip.GzipFile(fileobj=buf, mode="wb") as gz:
22
+ gz.write(data)
23
+ buf.seek(0)
24
+ self.client.write(file_path, data=buf, overwrite=self.overwrite)
25
+ else:
26
+ self.client.write(file_path, data=data, overwrite=self.overwrite)
27
+
28
+
29
+ class LocalFileWriter(BaseFileWriter):
30
+ def write_lines(self, lines: List[str], file_path: str, compress: bool = True, encoding="utf-8"):
31
+ if compress:
32
+ with gzip.open(file_path, 'wt', encoding=encoding) as f:
33
+ for line in lines:
34
+ f.write(f"{line}\n")
35
+ else:
36
+ with open(file_path, 'w', encoding=encoding) as f:
37
+ for line in lines:
38
+ f.write(f"{line}\n")
@@ -35,3 +35,10 @@ class DotDict(dict):
35
35
  else:
36
36
  result[key] = value
37
37
  return result
38
+
39
+
40
+ def none_to_empty_str(d):
41
+ for k, v in d.items():
42
+ if v is None:
43
+ d[k] = ""
44
+ return d
@@ -63,4 +63,8 @@ def list_to_dict(list_data,key_name):
63
63
 
64
64
  # 将 defaultdict 转换成普通字典
65
65
  dict_data = dict(dict_data)
66
- return dict_data
66
+ return dict_data
67
+
68
+ def split_list_by_step(lst, step=100):
69
+ # Convert a flat list into a 2-D list using the given step size
70
+ return [lst[i:i + step] for i in range(0, len(lst), step)]
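For example (illustrative): split_list_by_step([1, 2, 3, 4, 5], step=2) returns [[1, 2], [3, 4], [5]].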
@@ -0,0 +1,270 @@
1
+ import atexit
2
+ import sys
3
+ import asyncio
4
+ import aiohttp
5
+ from typing import Optional
6
+
7
+ from tenacity import retry, stop_after_attempt, wait_random
8
+
9
+ g_headers = {
10
+ 'accept': 'application/json',
11
+ 'Content-Type': 'application/json',
12
+ 'Authorization': 'Bearer eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJ1c2VyX2lkIjotMSwidXNlcl9uYW1lIjoiXHU1ZTk0XHU3NTI4XHU0ZTJkXHU1ZmMzQ2xpZW50In0.'
13
+ }
14
+
15
+ """
16
+ cls._conn = aiohttp.TCPConnector(
17
+ limit=50,  # maximum number of connections
18
+ ssl=False,  # disable SSL verification (enable as needed)
19
+ force_close=True,  # close the connection after each request
20
+ enable_cleanup_closed=True  # automatically clean up closed connections
21
+ )
22
+ # The Windows error "You were not connected because a duplicate name exists on the network. If joining a domain, go to System in Control Panel to change the computer name and try again. If joining a workgroup, choose another workgroup name."
23
+ may have been caused by
24
+ force_close=True, # close the connection after each request
25
+ enable_cleanup_closed=True # automatically clean up closed connections
26
+ (the two settings above).
27
+ """
28
+
29
+
30
+ class HttpError(Exception):
31
+ code = 0
32
+ message = ""
33
+ headers = None
34
+
35
+ def __init__(
36
+ self,
37
+ *,
38
+ code: Optional[int] = None,
39
+ message: str = "",
40
+ headers: Optional[dict] = None,
41
+ ) -> None:
42
+ if code is not None:
43
+ self.code = code
44
+ self.headers = headers
45
+ self.message = message
46
+
47
+ def __str__(self) -> str:
48
+ return f"code: {self.code}, message:{self.message}"
49
+
50
+ def __repr__(self) -> str:
51
+ return f"<{self.__class__.__name__}: code={self.code}, message={self.message!r}>"
52
+
53
+
54
+ def on_retry_error(retry_state):
55
+ # Called after the final attempt fails
56
+ original_exc = retry_state.outcome.exception()
57
+ print(f"[HTTP 请求重试所有重试失败.] 错误消息{original_exc}")
58
+
59
+ raise HttpError(code=getattr(original_exc, 'code', 455),
60
+ message=f"错误消息:{str(original_exc)}") from original_exc
61
+
62
+
63
+ def on_retry(retry_state):
64
+ # Called on every failed attempt to print a message
65
+ print(
66
+ f"[HTTP 请求重试]"
67
+ f"当前重试 : 第 {retry_state.attempt_number} 次"
68
+ f"睡眠时间 : {retry_state.next_action.sleep:.2f} 秒"
69
+ f"\n异常原因 : {retry_state.outcome.exception()}"
70
+ )
71
+
72
+
73
+ class ApiNetUtils:
74
+ """
75
+ HTTP request utility class (async), providing GET/POST/PATCH request methods.
76
+ Features:
77
+ 1. Automatically reuses a shared TCP connection pool
78
+ 2. Automatic retry mechanism (via the retry decorators)
79
+ 3. Automatically cleans up resources on process exit
80
+ 4. Thread-safe lazy initialization
81
+ """
82
+
83
+ # Class attributes use Optional annotations and start as None for lazy initialization
84
+ _conn: Optional[aiohttp.TCPConnector] = None
85
+ _session: Optional[aiohttp.ClientSession] = None
86
+ _close_registered: bool = False  # ensure the cleanup hook is registered only once
87
+
88
+ @classmethod
89
+ async def _get_connector(cls) -> aiohttp.TCPConnector:
90
+ """
91
+ Get the TCP connector (lazily initialized).
92
+ Avoids the problem of there being no event loop at module load time.
93
+ """
94
+ if cls._conn is None or cls._conn.closed or (cls._session is not None and cls.is_loop_closed(cls._session)):
95
+ # Create the connector only on first use
96
+ cls._conn = aiohttp.TCPConnector(
97
+ limit=50,  # maximum number of connections
98
+ ssl=False,  # disable SSL verification (enable as needed)
99
+ force_close=True,  # close the connection after each request
100
+ # enable_cleanup_closed=True,  # automatically clean up closed connections
101
+ # keepalive_timeout=4.99  # slightly below the server's 5 s keep-alive
102
+ )
103
+ return cls._conn
104
+
105
+ @classmethod
106
+ async def _get_session(cls) -> aiohttp.ClientSession:
107
+ """
108
+ Get the shared session (thread-safe lazy initialization).
109
+ Also registers the automatic cleanup hook.
110
+ """
111
+ if cls._session is None or cls._session.closed or cls.is_loop_closed(cls._session):
112
+ if cls._session:
113
+ await cls.close()
114
+ # Get the connector (initializing it automatically)
115
+ connector = await cls._get_connector()
116
+
117
+ # Explicitly obtain the event loop
118
+ loop = asyncio.get_event_loop()
119
+
120
+ # Create a new session
121
+ cls._session = aiohttp.ClientSession(
122
+ connector=connector,
123
+ timeout=aiohttp.ClientTimeout(total=30),  # default 30-second timeout
124
+ loop=loop)  # explicitly bind the event loop
125
+
126
+ # Register the cleanup hook for process exit
127
+ cls._register_cleanup()
128
+
129
+ return cls._session
130
+
131
+ @staticmethod
132
+ def is_loop_closed(session: aiohttp.ClientSession) -> bool:
133
+ """
134
+ Check whether the event loop bound to the session has been closed.
135
+ """
136
+ loop = session._loop  # the event loop the session is bound to
137
+ if loop.is_closed():
138
+ # print("Event loop is closed")
139
+ return True
140
+ # print("Event loop not is closed")
141
+ return False
142
+
143
+ @classmethod
144
+ def _register_cleanup(cls):
145
+ """
146
+ Register the resource-cleanup function for process exit.
147
+ Covers both normal exit and exit on an unhandled exception.
148
+ """
149
+ if not cls._close_registered:
150
+ # 1. Normal exit handling
151
+ atexit.register(lambda: asyncio.run(cls.close()))
152
+
153
+ # 2. Abnormal exit handling
154
+ original_excepthook = sys.excepthook
155
+
156
+ def custom_excepthook(exctype, value, traceback):
157
+ """自定义异常钩子,确保资源被清理"""
158
+ # Run the original exception handling first (prints the traceback, etc.)
159
+ original_excepthook(exctype, value, traceback)
160
+ # Then clean up the resources
161
+ try:
162
+ asyncio.run(cls.close())
163
+ except RuntimeError:
164
+ # If there is no event loop any more, run the cleanup on a fresh loop
165
+ loop = asyncio.new_event_loop()
166
+ loop.run_until_complete(cls.close())
167
+ loop.close()
168
+
169
+ sys.excepthook = custom_excepthook
170
+ cls._close_registered = True
171
+
172
+ @classmethod
173
+ async def close(cls):
174
+ """
175
+ Safely close all network resources.
176
+ Called automatically at program exit, but may also be invoked manually.
177
+ """
178
+ if cls._session and not cls._session.closed:
179
+ await cls._session.close()
180
+ cls._session = None
181
+
182
+ if cls._conn and not cls._conn.closed:
183
+ await cls._conn.close()
184
+ cls._conn = None
185
+
186
+ # print("[ApiNetUtils] 网络资源已安全释放")
187
+
188
+ # -------------------- Public API methods -------------------- #
189
+
190
+ @classmethod
191
+ @retry(stop=stop_after_attempt(4),  # four attempts in total, i.e. three retries
192
+ wait=wait_random(min=5, max=15),
193
+ before_sleep=on_retry,  # invoked after each failed attempt
194
+ retry_error_callback=on_retry_error,
195
+ reraise=True)
196
+ async def fetch_get(cls, url: str, headers=None, params=None):
197
+ """
198
+ GET请求封装
199
+ :param url: 请求URL
200
+ :param headers: 可选请求头(默认使用全局g_headers)
201
+ :param params: 查询参数(字典)
202
+ :return: 解析后的JSON数据
203
+ :raises HttpError: 当状态码非200时抛出
204
+ """
205
+ headers = headers or g_headers
206
+ session = await cls._get_session()
207
+
208
+ async with session.get(url, headers=headers, params=params) as response:
209
+ if response.status != 200:
210
+ error_text = await response.text()
211
+ raise HttpError(
212
+ code=response.status,
213
+ message=f"请求失败: url={url}, status={response.status}, 错误详情={error_text}"
214
+ )
215
+ return await response.json()
216
+
217
+ @classmethod
218
+ @retry(stop=stop_after_attempt(4),
219
+ wait=wait_random(min=5, max=15),
220
+ before_sleep=on_retry,  # invoked after each failed attempt
221
+ retry_error_callback=on_retry_error,
222
+ reraise=True)
223
+ async def fetch_post(cls, url: str, payload: dict, headers=None):
224
+ """
225
+ POST request wrapper (JSON body)
226
+ """
227
+ headers = headers or g_headers
228
+ session = await cls._get_session()
229
+
230
+ async with session.post(url, json=payload, headers=headers) as response:
231
+ if response.status != 200:
232
+ error_text = await response.text()
233
+ raise HttpError(
234
+ code=response.status,
235
+ message=f"请求失败: url={url}, status={response.status}, 错误详情={error_text}"
236
+ )
237
+ return await response.json()
238
+
239
+ @classmethod
240
+ @retry(stop=stop_after_attempt(4),
241
+ wait=wait_random(min=5, max=15),
242
+ before_sleep=on_retry,  # invoked after each failed attempt
243
+ retry_error_callback=on_retry_error,
244
+ reraise=True)
245
+ async def fetch_patch(cls, url: str, payload: dict, headers=None):
246
+ """
247
+ PATCH request wrapper (JSON body)
248
+ """
249
+ headers = headers or g_headers
250
+ session = await cls._get_session()
251
+
252
+ async with session.patch(url, json=payload, headers=headers) as response:
253
+ if response.status != 200:
254
+ error_text = await response.text()
255
+ raise HttpError(
256
+ code=response.status,
257
+ message=f"请求失败: url={url}, status={response.status}, 错误详情={error_text}"
258
+ )
259
+ return await response.json()
260
+
261
+ @classmethod
262
+ async def __aenter__(cls):
263
+ """支持async with语法"""
264
+ await cls._get_session()
265
+ return cls
266
+
267
+ @classmethod
268
+ async def __aexit__(cls, exc_type, exc, tb):
269
+ """async with退出时自动关闭"""
270
+ await cls.close()
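A hedged usage sketch for the class above; the URL is a placeholder, and the shared session is created lazily on the first call:

import asyncio

from re_common.v2.baselibrary.utils.api_net_utils import ApiNetUtils, HttpError


async def main():
    try:
        data = await ApiNetUtils.fetch_get("http://example.com/api/items", params={"page": 1})
        print(data)
    except HttpError as e:
        print(e)  # raised once all retries are exhausted
    finally:
        await ApiNetUtils.close()  # optional; a cleanup hook is also registered via atexit


asyncio.run(main())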
@@ -1,8 +1,9 @@
1
+ import atexit
2
+ import os
1
3
  from contextlib import asynccontextmanager
2
4
  from typing import AsyncGenerator, Tuple
3
5
 
4
- import aiomysql
5
- from aiomysql import Pool, Connection, Cursor, DictCursor
6
+ from aiomysql import Pool, Connection, Cursor
6
7
 
7
8
  DB_CONFIG = {
8
9
  'host': '192.168.98.55',
@@ -18,10 +19,27 @@ DB_CONFIG = {
18
19
  'echo': False, # 打印SQL语句
19
20
  }
20
21
 
22
+ DB_CONFIG1 = {
23
+ 'host': '192.168.98.55',
24
+ 'port': 4000,
25
+ 'user': 'foreign_fulltextUser',
26
+ 'password': 'i4hIeasw1qpmhGN2nwL7',
27
+ 'db': 'foreign_fulltext',
28
+ 'charset': 'utf8mb4',
29
+ 'minsize': 16, # 最小连接数
30
+ 'maxsize': 128, # 最大连接数
31
+ 'autocommit': False, # 自动提交事务
32
+ 'pool_recycle': 3600, # 每个连接的回收时间(秒),超过此时间后连接将被关闭并重新创建,避免失效连接
33
+ 'echo': False, # 打印SQL语句
34
+ }
35
+
21
36
 
22
37
  @asynccontextmanager
23
- async def get_db_pool():
38
+ async def get_db_pool(_DB_CONFIG: dict = None):
24
39
  """异步数据库连接池管理工具"""
40
+ global DB_CONFIG
41
+ if _DB_CONFIG is not None:
42
+ DB_CONFIG = _DB_CONFIG
25
43
  pool: Pool = await aiomysql.create_pool(**DB_CONFIG)
26
44
  try:
27
45
  yield pool
@@ -36,3 +54,21 @@ async def get_session(pool: Pool) -> AsyncGenerator[Tuple[Connection, Cursor], N
36
54
  async with pool.acquire() as conn:
37
55
  async with conn.cursor() as cursor:
38
56
  yield conn, cursor
57
+
58
+
59
+ # main.py
60
+ import aiomysql
61
+ import asyncio
62
+
63
+ aiomysql_pool = None
64
+ pool_lock = asyncio.Lock()  # global async lock
65
+
66
+
67
+ async def init_aiomysql_pool_async():
68
+ global aiomysql_pool
69
+ if aiomysql_pool is None:
70
+ async with pool_lock:
71
+ if aiomysql_pool is None:
72
+ print(f"[{os.getpid()}] Initializing aiomysql pool...")
73
+ aiomysql_pool = await aiomysql.create_pool(**DB_CONFIG)
74
+ return aiomysql_pool
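A minimal sketch of how these helpers appear meant to combine, assuming get_session is an asynccontextmanager like get_db_pool; the query is illustrative:

import asyncio

from re_common.v2.baselibrary.utils.db import get_db_pool, get_session


async def main():
    async with get_db_pool() as pool:  # pass a config dict to override the default DB_CONFIG
        async with get_session(pool) as (conn, cursor):
            await cursor.execute("SELECT 1")  # illustrative query
            print(await cursor.fetchone())


asyncio.run(main())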
@@ -9,7 +9,7 @@ from re_common.v2.baselibrary.utils.stringutils import (
9
9
  bj2qj,
10
10
  get_diacritic_variant,
11
11
  clean_html,
12
- remove_spaces_between_chinese_characters,
12
+ remove_spaces_between_chinese_characters, clean_unicode_alnum,
13
13
  )
14
14
 
15
15
 
@@ -91,6 +91,11 @@ class StringClear(object):
91
91
  self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
92
92
  return self
93
93
 
94
+ def remove_all_symbols(self):
95
+ # A stronger symbol cleanup: keep only letters and digits from any script
96
+ self.obj_str = clean_unicode_alnum(self.obj_str)
97
+ return self
98
+
94
99
  def remove_underline(self):
95
100
  # 下划线在 \w 中 所以这里独立封装
96
101
  self.obj_str = re.sub("[_]", "", self.obj_str)
@@ -211,3 +211,18 @@ def get_group_abstract(lists):
211
211
  t_list.append(keyid_list[text_idx])
212
212
  all_list.append(t_list)
213
213
  return all_list
214
+
215
+
216
+ def clean_unicode_alnum(text: str) -> str:
217
+ """
218
+ Remove every character that is not a Unicode letter or digit.
219
+
220
+ Args:
221
+ text (str): input text.
222
+
223
+ Returns:
224
+ str: text containing only Unicode letters and digits.
225
+ \p{N} matches every Unicode digit character, including non-Arabic digits
226
+ \p{L} matches letters from every script
227
+ """
228
+ return regex.sub(r"[^\p{L}\p{N}]+", "", text)
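For example (illustrative): clean_unicode_alnum("Hello, 世界! 123…") returns "Hello世界123".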
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.24
3
+ Version: 10.0.26
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -163,7 +163,7 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
163
163
  re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
164
164
  re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
165
  re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=M3Pb9TB9QRePa9phjqx6JZV_igM-xRRa554iZ-WgyOo,6786
166
+ re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
167
167
  re_common/v2/baselibrary/business_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
168
  re_common/v2/baselibrary/business_utils/rel_tools.py,sha256=LfnGFCkUSxg1SHvOMOQdP1PiHxIKqk7Syuk5YYpjJag,295
169
169
  re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -174,29 +174,35 @@ re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLb
174
174
  re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=sKBt2gPsfj0gzV6KaLSAhIhL-j3qNfHfqE-lII1LVwM,3537
175
175
  re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
176
  re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
177
- re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
177
+ re_common/v2/baselibrary/tools/dict_tools.py,sha256=eSMwPTLp3oSjuviC_wlXg0I-dnkkmZfUfCRLX5djWV8,1365
178
178
  re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
179
179
  re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=g0DaNjXM1hIUblFQ6YBwnwEBKIXn48X8Y9Eiok4dVlQ,14824
180
- re_common/v2/baselibrary/tools/list_tools.py,sha256=mZyrOGdW6tuany0lKQOD4P739xikvmeKm1VSzo37Byc,1973
180
+ re_common/v2/baselibrary/tools/list_tools.py,sha256=1NxGVM4EytSXh4IGAEfZQnvq0Ev-UOF-PGZBg2EQbOg,2132
181
181
  re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
182
182
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=cPMoFxaA0-ce3tLRxVSs8_3pTYS1oVIHDnNy_AlPU-4,10756
183
183
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
184
+ re_common/v2/baselibrary/tools/data_processer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
185
+ re_common/v2/baselibrary/tools/data_processer/base.py,sha256=i6HA2UQsSRZaKxW1wJMpiC9LAy3wYaI2BVxUAiFoRZ4,1704
186
+ re_common/v2/baselibrary/tools/data_processer/data_processer.py,sha256=R7zHQG8eo3mfckYr-Pp53fyyQj6zd8fuweSxwzvDgN0,22683
187
+ re_common/v2/baselibrary/tools/data_processer/data_reader.py,sha256=LWLbom7W2L0T6q38crA1_Gcvxkzk9Lm0btJjrmtMHMU,7945
188
+ re_common/v2/baselibrary/tools/data_processer/data_writer.py,sha256=OgKZ06zRJYNx758rbjxZG_KNgkLuVLlyB1AvyRsJtS4,1447
184
189
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
185
190
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
191
+ re_common/v2/baselibrary/utils/api_net_utils.py,sha256=22q3WMWiKVg1IVGr4y2D1JrjhnbQtlChRDJm2S8rGlc,9868
186
192
  re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
187
193
  re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
188
194
  re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
189
195
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=TPwFct_-UrmO1KCbo4gpV77rsnlCQDumNBbQKL0ZI9o,5953
190
196
  re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
191
197
  re_common/v2/baselibrary/utils/basetime.py,sha256=b7U_ho6nE3fjYBxSkdMHXUOd3ClH6KkW_7p7l2Gs4gA,3038
192
- re_common/v2/baselibrary/utils/db.py,sha256=lr6SI1Bk7tA0nw_4vHbmz-OaisPlb_p9ziC1Oz74tcA,1216
198
+ re_common/v2/baselibrary/utils/db.py,sha256=ouDagXqqY9h4ucK4LDGrYVY-31rOiBQFxXLIlio9AJA,2297
193
199
  re_common/v2/baselibrary/utils/json_cls.py,sha256=M93piYtmgm_wP8E57culTrd_AhHLoGg6PqeAJYdW2SM,438
194
200
  re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
195
201
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
196
202
  re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
197
- re_common/v2/baselibrary/utils/string_clear.py,sha256=_bK8oi7t34JfFh8IVbe0fFexBzafFWow0r3QJ45HsyE,7212
203
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=ywYR1KrKQyeM-zJgvTmORlfgbLdRSjWWKPe7K8oRx_8,7450
198
204
  re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
199
- re_common/v2/baselibrary/utils/stringutils.py,sha256=WuxhXJVU6xuGfgHiSjxrn7Go1eobpa8DMR3Icoey4vo,6039
205
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=TI6fw3km1l25ufXrnG6ha8dSBDtRh-MF4nWRt9u8Xbo,6452
200
206
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
207
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
202
208
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -223,8 +229,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
223
229
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
224
230
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
225
231
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
226
- re_common-10.0.24.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
227
- re_common-10.0.24.dist-info/METADATA,sha256=R1kT5A-yNJHQ3lvpchpqc3dX8ONJK1kuTfoGUQ08018,582
228
- re_common-10.0.24.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
229
- re_common-10.0.24.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
230
- re_common-10.0.24.dist-info/RECORD,,
232
+ re_common-10.0.26.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
233
+ re_common-10.0.26.dist-info/METADATA,sha256=kHLVPF-e0PjpnUL7dN9pAMqK_pw4yHwZGKxbJ_zlAY0,582
234
+ re_common-10.0.26.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
235
+ re_common-10.0.26.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
236
+ re_common-10.0.26.dist-info/RECORD,,