re-common 10.0.24__py3-none-any.whl → 10.0.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +1 -0
- re_common/v2/baselibrary/tools/data_processer/__init__.py +0 -0
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -0
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -0
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -0
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -0
- re_common/v2/baselibrary/tools/dict_tools.py +7 -0
- re_common/v2/baselibrary/tools/list_tools.py +5 -1
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -0
- re_common/v2/baselibrary/utils/db.py +39 -3
- re_common/v2/baselibrary/utils/string_clear.py +6 -1
- re_common/v2/baselibrary/utils/stringutils.py +15 -0
- {re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/METADATA +1 -1
- {re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/RECORD +17 -11
- {re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/LICENSE +0 -0
- {re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/WHEEL +0 -0
- {re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/top_level.txt +0 -0

re_common/v2/baselibrary/tools/data_processer/__init__.py: File without changes

re_common/v2/baselibrary/tools/data_processer/base.py
@@ -0,0 +1,53 @@
+from abc import ABC, abstractmethod
+from typing import List, Generator
+
+
+class BaseFileReader(ABC):
+
+    def __init__(self, batch_size: int = 10000):
+        self.batch_size = batch_size
+        self.read_model = 1
+
+    @abstractmethod
+    def list_files(self, path: str) -> List[str]:
+        """List all target files under the given path."""
+        pass
+
+    @abstractmethod
+    def count_lines(self, file_path: str) -> int:
+        """Count the number of lines in the file."""
+        pass
+
+    @abstractmethod
+    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
+        """Read the file contents and yield them in batches."""
+        pass
+
+    @abstractmethod
+    def read_all(self, file_path: str) -> List[List[str]]:
+        """Read the whole file, split into batches (1000 lines by default)."""
+        return [line for line in self.read_lines(file_path)]
+
+    def read_select(self, file_path: str) -> Generator[List[str], None, None]:
+        if self.read_model == 1:
+            for batch_data in self.read_lines(file_path):
+                yield batch_data
+        elif self.read_model == 2:
+            for batch_data in self.read_all(file_path):
+                yield batch_data
+        else:
+            raise Exception("invalid read mode")
+
+
+class BaseFileWriter(ABC):
+
+    def __init__(self, file_path: str, compress: bool = True, overwrite: bool = True, encoding: str = "utf-8"):
+        self.file_path = file_path
+        self.compress = compress
+        self.encoding = encoding
+        self.overwrite = overwrite
+
+    @abstractmethod
+    def write_lines(self, lines: List[str], file_path: str):
+        """Write multiple lines of text to a file, with optional compression."""
+        pass
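
For orientation, a minimal sketch of how a concrete reader can plug into BaseFileReader and how read_select dispatches on read_model; ListReader, its data and the dummy path are illustrative only and not part of the package.

from typing import List, Generator

from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader


class ListReader(BaseFileReader):
    """Hypothetical reader that serves pre-loaded lines instead of real files."""

    def __init__(self, lines: List[str], batch_size: int = 2):
        super().__init__(batch_size)
        self._lines = lines

    def list_files(self, path: str) -> List[str]:
        return [path]

    def count_lines(self, file_path: str) -> int:
        return len(self._lines)

    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
        for i in range(0, len(self._lines), self.batch_size):
            yield self._lines[i: i + self.batch_size]

    def read_all(self, file_path: str) -> List[List[str]]:
        return list(self.read_lines(file_path))


reader = ListReader(["a", "b", "c"], batch_size=2)
reader.read_model = 2  # 1 = stream via read_lines, 2 = load everything via read_all
for batch in reader.read_select("dummy-path"):
    print(batch)  # ['a', 'b'] then ['c']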

re_common/v2/baselibrary/tools/data_processer/data_processer.py
@@ -0,0 +1,508 @@
+import asyncio
+import os
+import re
+import sqlite3
+import time
+import traceback
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+from typing import List, Callable, Any
+
+from filelock import FileLock
+from tenacity import retry, stop_after_attempt, wait_exponential, wait_random, retry_if_result
+
+from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader, BaseFileWriter
+
+
+class DatabaseHandler:
+    def __init__(self, db_file="processed_files.db"):
+        self.db_file = db_file
+        self.lock_file = f"{self.db_file}.lock"
+        self._init_db()
+
+    def _init_db(self):
+        with FileLock(self.lock_file):
+            """Initialize the SQLite database"""
+            with sqlite3.connect(self.db_file) as conn:
+                cursor = conn.cursor()
+                cursor.execute("""
+                    CREATE TABLE IF NOT EXISTS processed_files (
+                        file_path TEXT PRIMARY KEY
+                    )
+                """)
+                conn.commit()
+
+    def save_processed_file(self, file_path):
+        """Record a file as processed."""
+        with FileLock(self.lock_file):
+            with sqlite3.connect(self.db_file) as conn:
+                cursor = conn.cursor()
+                cursor.execute(
+                    "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
+                    (file_path,)
+                )
+                conn.commit()
+
+    def save_processed_files_many(self, file_paths):
+        """Record multiple processed file paths in one batch."""
+        if not file_paths:
+            return
+        with FileLock(self.lock_file):
+            with sqlite3.connect(self.db_file) as conn:
+                cursor = conn.cursor()
+                cursor.executemany(
+                    "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
+                    ((fp,) for fp in file_paths)
+                )
+                conn.commit()
+
+    def is_file_processed(self, file_path):
+        """Check whether a file has already been processed."""
+        with FileLock(self.lock_file):
+            with sqlite3.connect(self.db_file) as conn:
+                cursor = conn.cursor()
+                cursor.execute(
+                    "SELECT file_path FROM processed_files WHERE file_path = ?",
+                    (file_path,)
+                )
+                result = cursor.fetchone()
+                return result is not None
+
+    def fake_processed_files(self, start_index, end_index, file_list):
+        try:
+            # Convert the string indexes to integers
+            start = int(start_index)
+            end = int(end_index)
+
+            # Validate the index range
+            if start >= end:
+                raise ValueError(f"start index {start_index} must be less than end index {end_index}")
+
+            list_formatted_num = []
+            # Generate a file number for every index in the range
+            for num in range(start, end):
+                # Format the index as a 5-digit string (00120, 00121, ...)
+                formatted_num = f"{num:05d}"
+                list_formatted_num.append(formatted_num)
+
+            skip_path_list = []
+            skip_formatted_num = []
+            for file_path in file_list:
+                re_f_num = re.findall(r'(?<!\d)\d{5}(?!\d)', str(Path(file_path).stem))
+                if re_f_num:
+                    if re_f_num[0] in list_formatted_num:
+                        skip_path_list.append(file_path)
+                        skip_formatted_num.append(re_f_num[0])
+
+            for item_list in [skip_path_list[i:i + 2000] for i in range(0, len(skip_path_list), 2000)]:
+                self.save_processed_files_many(item_list)
+                for file_path in item_list:
+                    print(f"Faked processed record: {file_path}")
+
+            no_fil_num = set(list_formatted_num) - set(skip_formatted_num)
+            if len(no_fil_num) > 0:
+                print(f"No file matches these numbers; {len(no_fil_num)} records were not faked, samples: {list(no_fil_num)[:10]}")
+            print(f"Successfully faked processed records for files numbered {start_index} to {end_index} (exclusive)")
+
+        except ValueError as e:
+            print(f"Error: invalid index format - {str(e)}")
+        except Exception as e:
+            print(f"Error while faking processed records: {str(e)}")
+
+
+class DataProcessor:
+    def __init__(
+            self,
+            reader: BaseFileReader,
+            writer: BaseFileWriter = None,
+            db_handler: DatabaseHandler = None,
+            db_file="processed_files.db",
+            batch_size=50,
+            retry_limit=3,
+    ):
+        self.reader = reader
+        self.writer = writer
+        self.db_file = db_file
+        self.batch_size = batch_size
+        self.retry_limit = retry_limit
+        self.db_handler = db_handler if db_handler else DatabaseHandler(db_file=db_file)
+
+    async def retry_process_data(self, data, process_func):
+        """Run the processing function on one piece of data, with retries."""
+
+        def on_retry(retry_state):
+            # Called on every failed attempt; prints a message
+            print(
+                f"Retry attempt: {retry_state.attempt_number}/{self.retry_limit}, data: {retry_state.args[0]}\n"
+                f"Exception: {retry_state.outcome.exception()}"
+            )
+
+        def on_retry_error(retry_state):
+            # Called after the final attempt fails
+            original_exc = retry_state.outcome.exception()
+            raise RuntimeError(
+                f"Data processing failed, retry limit reached. data: {retry_state.args[0]}") from original_exc  # the raised exception keeps the original traceback from inside __process_func()
+
+        @retry(stop=stop_after_attempt(3),
+               wait=wait_exponential(multiplier=1, min=2, max=20),
+               before_sleep=on_retry,  # called after each failed attempt
+               retry_error_callback=on_retry_error,  # raise if every attempt fails
+               reraise=True)
+        async def __process_func(_data):
+            return await process_func(_data)
+
+        return await __process_func(data)
+
+    async def process_file(self, hdfs_file_path, process_func, write_dir):
+        """Process a single gz file."""
+        total_lines = self.reader.count_lines(hdfs_file_path)
+        processed_lines = 0
+        start_time = time.time()
+        results = []
+        # Select the reading method according to the configuration
+        for lines in self.reader.read_select(hdfs_file_path):
+            processing_start_time = time.time()  # start time of this batch
+
+            tasks = []
+            for line in lines:
+                # try:
+                #     data = json.loads(line)
+                #     tasks.append(self.retry_process_data(data, process_func))
+                # except json.JSONDecodeError as e:
+                #     raise Exception(f"JSON parsing failed: {e}, line content: {line.strip()}")
+                tasks.append(self.retry_process_data(line, process_func))
+
+            # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool suits submitting all tasks at once with a concurrency limit
+            results.extend(await asyncio.gather(*tasks))
+
+            processed_lines += len(lines)
+
+            elapsed_time = time.time() - start_time  # elapsed time
+            processing_time = time.time() - processing_start_time  # time spent on this batch
+            avg_processing_time = (
+                (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+            )  # average processing time per record (milliseconds)
+
+            # Estimate the remaining time
+            remaining_time = (
+                ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                if processed_lines > 0
+                else float("inf")
+            )
+
+            # Print overall progress
+            print(
+                f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
+                f"estimated remaining: {remaining_time:.2f}s | average per record: {avg_processing_time:.2f}ms"
+            )
+
+        if write_dir is not None:
+            if not self.writer:
+                raise Exception("no writer configured")
+            write_path = write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}"
+            self.writer.write_lines([str(item) for item in results], write_path)
+
+        # Final progress report
+        final_elapsed_time = time.time() - start_time  # total elapsed time
+        print(
+            f"File: {hdfs_file_path} done | progress: {processed_lines}/{total_lines} lines | "
+            f"total elapsed: {final_elapsed_time:.2f}s | "
+            f"average per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+            if processed_lines > 0
+            else "no data processed"
+        )
+
+        self.db_handler.save_processed_file(hdfs_file_path)  # record the processed file
+
+    async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
+        """File processing with retry support."""
+
+        def on_retry(retry_state):
+            # Called on every failed attempt; prints a message
+            exc = retry_state.outcome.exception()
+            tb = ''.join(traceback.format_exception(type(exc), exc, exc.__traceback__))
+            print(tb)
+
+            print(
+                f"Error while processing file {retry_state.args[0]}: {exc}, retrying {retry_state.attempt_number}/{self.retry_limit}")
+
+        def on_retry_error(retry_state):
+            # Called after the final attempt fails
+            print(f"Failed to process file {retry_state.args[0]}, retry limit reached")
+            return False
+
+        @retry(stop=stop_after_attempt(3),
+               wait=wait_exponential(multiplier=1, min=2, max=20),
+               before_sleep=on_retry,  # called after each failed attempt
+               retry_error_callback=on_retry_error,  # return False if every attempt fails
+               reraise=True)
+        async def __process_func(_hdfs_file_path, _process_func, _write_dir):
+            await self.process_file(_hdfs_file_path, _process_func, _write_dir)
+            return True  # exit after successful processing
+
+        return await __process_func(hdfs_file_path, process_func, write_dir)
+
+    def get_file_list(self, hdfs_dir):
+        # Collect all task files
+        all_files = self.reader.list_files(hdfs_dir)
+        for file_path in all_files:
+            yield file_path
+
+    @retry(stop=stop_after_attempt(3),
+           wait=wait_random(min=10, max=30),
+           retry=retry_if_result(lambda result: not result),  # retry when the result is False (failure); eventually raises the default tenacity.RetryError
+           reraise=True)
+    async def _batch_process_file(self, hdfs_file_path: str, process_func: Callable[[str], Any],
+                                  write_dir: str = None):
+        """Batch update of all gz files."""
+        # all_succeed = True
+        # for hdfs_file_path in self.get_file_list(hdfs_dir):
+        #     if self.db_handler.is_file_processed(hdfs_file_path):
+        #         print(f"Skipping already processed file: {hdfs_file_path}")
+        #         continue  # skip if the file has already been processed
+        #     succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir)  # process the file
+        #     if succeed is False:
+        #         all_succeed = False
+        #
+        # if all_succeed:
+        #     # Delete the database file once everything has been processed
+        #     try:
+        #         if os.path.exists(self.db_file):
+        #             os.remove(self.db_file)
+        #             print(f"Deleted checkpoint file: {self.db_file}")
+        #             return True
+        #     except Exception as e:
+        #         print(f"Failed to delete checkpoint file: {e}")
+        #         return False
+        if self.db_handler.is_file_processed(hdfs_file_path):
+            print(f"Skipping already processed file: {hdfs_file_path}")
+            return True  # skip if the file has already been processed
+        succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir)  # process the file
+        return succeed
+
+    async def process_file_bulk(self, hdfs_file_path, process_func, write_dir):
+        """Process a single file batch by batch, passing each batch to the processing function."""
+        # Get the total number of lines in the file
+        total_lines = self.reader.count_lines(hdfs_file_path)
+        processed_lines = 0
+        start_time = time.time()
+
+        results = []
+        tasks = []
+        # Select the reading method according to the configuration
+        for lines in self.reader.read_select(hdfs_file_path):
+            processing_start_time = time.time()  # start time of this batch
+
+            # batch_data = []
+            # for line in lines:
+            #     try:
+            #         data = json.loads(line)
+            #         batch_data.append(data)
+            #     except json.JSONDecodeError as e:
+            #         raise Exception(f"JSON parsing failed: {e}, line content: {line.strip()}")
+
+            # Process the batch that was read
+            if lines:
+                tasks.append(process_func(lines))  # pass the batch to the processing function and collect the task
+                processed_lines += len(lines)  # update the processed line count
+
+            # Once the accumulated tasks reach batch_size, run them all concurrently
+            if len(tasks) >= self.batch_size:
+                results.extend(await asyncio.gather(*tasks))
+                elapsed_time = time.time() - start_time  # elapsed time
+                processing_time = time.time() - processing_start_time  # time spent on this batch
+                avg_processing_time = (
+                    (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+                )  # average processing time per record (milliseconds)
+
+                # Estimate the remaining time
+                remaining_time = (
+                    ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                    if processed_lines > 0
+                    else float("inf")
+                )
+
+                # Print overall progress
+                print(
+                    f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                    f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
+                    f"estimated remaining: {remaining_time:.2f}s | average per record: {avg_processing_time:.2f}ms"
+                )
+
+                # Clear the task list for the next batch
+                tasks.clear()
+        # Run any remaining tasks
+        if tasks:
+            results.extend(await asyncio.gather(*tasks))  # tasks left over that never reached batch_size
+
+        if write_dir is not None:
+            if not self.writer:
+                raise Exception("no writer configured")
+            write_path = write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}"
+            self.writer.write_lines([str(item) for items in results for item in items], write_path)
+
+        # Final progress report
+        final_elapsed_time = time.time() - start_time  # total elapsed time
+        print(
+            f"File: {hdfs_file_path} done | progress: {processed_lines}/{total_lines} lines | "
+            f"total elapsed: {final_elapsed_time:.2f}s | "
+            f"average per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+            if processed_lines > 0
+            else "no data processed"
+        )
+
+        self.db_handler.save_processed_file(hdfs_file_path)
+
+    async def retry_process_file_bulk(self, hdfs_file_path, process_func, write_dir):
+        """Bulk file processing with retry support."""
+
+        def on_retry(retry_state):
+            # Called on every failed attempt; prints a message
+            exc = retry_state.outcome.exception()
+            tb = ''.join(traceback.format_exception(type(exc), exc, exc.__traceback__))
+            print(tb)
+            print(
+                f"Error while processing file {retry_state.args[0]}: {exc}, retrying {retry_state.attempt_number}/{self.retry_limit}")
+
+        def on_retry_error(retry_state):
+            # Called after the final attempt fails
+            print(f"Failed to process file {retry_state.args[0]}, retry limit reached")
+            return False
+
+        @retry(stop=stop_after_attempt(3),
+               wait=wait_exponential(multiplier=1, min=2, max=20),
+               before_sleep=on_retry,  # called after each failed attempt
+               retry_error_callback=on_retry_error,  # return False if every attempt fails
+               reraise=True)
+        async def __process_func(_hdfs_file_path, _process_func, write_dir):
+            await self.process_file_bulk(_hdfs_file_path, _process_func, write_dir)
+            return True  # exit after successful processing
+
+        return await __process_func(hdfs_file_path, process_func, write_dir)
+
+    async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[List[str]], Any] | Callable[[str], Any],
+                                 write_dir: str = None, is_bulk: bool = False):
+        all_succeed = True
+        for hdfs_file_path in self.get_file_list(hdfs_dir):
+            if is_bulk:
+                succeed = await self._batch_process_file_bulk(hdfs_file_path, process_func, write_dir)
+            else:
+                succeed = await self._batch_process_file(hdfs_file_path, process_func, write_dir)
+            if succeed is False:
+                all_succeed = False
+        if all_succeed:
+            # Delete the database file once everything has been processed
+            try:
+                if os.path.exists(self.db_file):
+                    os.remove(self.db_file)
+                    print(f"Deleted checkpoint file: {self.db_file}")
+                    return True
+            except Exception as e:
+                print(f"Failed to delete checkpoint file: {e}")
+                return False
+
+    @retry(stop=stop_after_attempt(3),
+           wait=wait_random(min=10, max=30),
+           retry=retry_if_result(lambda result: not result),  # retry when the result is False (failure); eventually raises the default tenacity.RetryError
+           reraise=True)
+    async def _batch_process_file_bulk(self, hdfs_file_path: str, process_func: Callable[[List[str]], Any],
+                                       write_dir: str = None):
+        """Bulk-process the data in a gz file."""
+        # Collect all files
+        # all_succeed = True
+        # for hdfs_file_path in self.get_file_list(hdfs_dir):
+        #     # Check whether to skip the file
+        #     if self.db_handler.is_file_processed(hdfs_file_path):
+        #         print(f"Skipping already processed file: {hdfs_file_path}")
+        #         continue  # skip already processed files
+        #     # Start bulk processing of the file
+        #     succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func, write_dir)
+        #     if succeed is False:
+        #         all_succeed = False
+        #
+        # if all_succeed:
+        #     # Delete the database file once everything has been processed
+        #     try:
+        #         if os.path.exists(self.db_file):
+        #             os.remove(self.db_file)
+        #             print(f"Deleted checkpoint file: {self.db_file}")
+        #             return True
+        #     except Exception as e:
+        #         print(f"Failed to delete checkpoint file: {e}")
+        #         return False
+        # Check whether to skip the file
+        if self.db_handler.is_file_processed(hdfs_file_path):
+            print(f"Skipping already processed file: {hdfs_file_path}")
+            return True  # skip already processed files
+        # Start bulk processing of the file
+        succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func, write_dir)
+        return succeed
+
+
+# Global variables, held independently by each worker process
+_processor: DataProcessor | None = None
+_process_func: Callable[[List[str]], Any] | Callable[[str], Any] | None = None
+_process_args: dict
+
+
+def get_data_processor_func(process_args):
+    _func_reader = process_args["reader_func"]
+    _reader_args = process_args["reader_kwargs"]
+    reader = _func_reader(**_reader_args)
+    writer = None
+    if process_args["is_writer"]:
+        _func_writer = process_args["writer_func"]
+        _writer_args = process_args["writer_kwargs"]
+        writer = _func_writer(**_writer_args)
+
+    data_kwargs = {
+        "reader": reader,
+        "writer": writer,
+        "db_file": process_args["db_file"]
+    }
+    if process_args.get("batch_size"):
+        data_kwargs["batch_size"] = process_args["batch_size"]
+    if process_args.get("retry_limit"):
+        data_kwargs["retry_limit"] = process_args["retry_limit"]
+
+    return DataProcessor(**data_kwargs)
+
+
+def init_worker(process_func, process_args):
+    global _processor, _process_func, _process_args
+    _processor = get_data_processor_func(process_args)
+    _process_func = process_func
+    _process_args = process_args
+
+    _init_func = _process_args.get("init_work", None)
+    if _init_func:
+        _init_func()
+
+    _async_init_work = _process_args.get("async_init_work", None)
+    if _async_init_work:
+        asyncio.run(_async_init_work())
+
+
+def worker(path_file):
+    if _process_args["is_bulk"]:
+        return asyncio.run(_processor._batch_process_file_bulk(path_file, _process_func, _process_args["write_dir"]))
+    else:
+        return asyncio.run(_processor._batch_process_file(path_file, _process_func, _process_args["write_dir"]))
+
+
+def run_worker_many(hdfs_dir: str, process_func: Callable[[List[str]], Any] | Callable[[str], Any],
+                    data_process_args: dict, max_workers=4):
+    processor = get_data_processor_func(data_process_args)
+    all_file = list(processor.get_file_list(hdfs_dir))
+    with ProcessPoolExecutor(
+            max_workers=max_workers,
+            initializer=init_worker,
+            initargs=(process_func, data_process_args)
+    ) as executor:
+        # Submit the tasks and wait for the results
+        results = executor.map(worker, all_file)
+        # Print the results
+        for result in results:
+            if result:
+                print(result)
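
A sketch of how the new processor might be driven, both from a single process and through run_worker_many; the directory paths and the handle_line coroutine are placeholders, and the data_process_args keys are taken from what get_data_processor_func, init_worker and worker read above.

import asyncio

from re_common.v2.baselibrary.tools.data_processer.data_processer import DataProcessor, run_worker_many
from re_common.v2.baselibrary.tools.data_processer.data_reader import LocalGZFileReader


async def handle_line(line: str) -> str:
    # Placeholder per-line coroutine; real parsing or API calls would go here.
    return line.upper()


if __name__ == "__main__":
    # Single process: one DataProcessor drives one reader directly.
    processor = DataProcessor(reader=LocalGZFileReader(batch_size=1000), db_file="processed_files.db")
    asyncio.run(processor.batch_process_file("/data/in", handle_line, write_dir=None, is_bulk=False))

    # Multiple processes: run_worker_many rebuilds the reader inside every worker,
    # so reader/writer are passed as factory plus kwargs (keys read by get_data_processor_func / worker).
    data_process_args = {
        "reader_func": LocalGZFileReader,
        "reader_kwargs": {"batch_size": 1000},
        "is_writer": False,
        "writer_func": None,
        "writer_kwargs": None,
        "db_file": "processed_files.db",
        "is_bulk": False,
        "write_dir": None,
    }
    run_worker_many("/data/in", handle_line, data_process_args, max_workers=4)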

re_common/v2/baselibrary/tools/data_processer/data_reader.py
@@ -0,0 +1,187 @@
+import gzip
+import io
+import json
+from io import BytesIO
+from pathlib import Path
+from typing import List, Generator
+
+import pandas as pd
+from hdfs import InsecureClient
+
+from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader
+
+
+class HDFSFileReader(BaseFileReader):
+    def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
+        super().__init__(batch_size)
+        self.client = InsecureClient(hdfs_url, user=hdfs_user)
+
+    def list_files(self, path: str) -> List[str]:
+        return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0] != '_SUCCESS']
+
+    def count_lines(self, file_path: str) -> int:
+        with self.client.read(file_path) as f:
+            return sum(1 for _ in f)
+
+    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
+        # Read in batches and then process; drawback: the connection may drop
+        with self.client.read(file_path) as f:
+            while True:
+                batch = []
+                for _ in range(self.batch_size):
+                    try:
+                        line = next(f)
+                        line = line.decode('utf-8')
+                        if line.strip():
+                            batch.append(line.strip())
+                    except StopIteration:
+                        break
+                if not batch:
+                    break
+                yield batch
+
+    def read_all(self, file_path: str) -> List[List[str]]:
+        # Read everything at once and batch afterwards; drawback: memory usage
+        with self.client.read(file_path) as f:
+            lines = [line.decode('utf-8').strip() for line in f if line.decode('utf-8').strip()]
+            return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+
+
+class HDFSGZFileReader(BaseFileReader):
+    def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
+        super().__init__(batch_size)
+        self.hdfs_url = hdfs_url
+        self.hdfs_user = hdfs_user
+        self.client = None
+
+    def _init_client(self):
+        if self.client is None:
+            self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
+        return self
+
+    def list_files(self, path: str) -> List[str]:
+        self._init_client()
+        return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".gz")]
+
+    def count_lines(self, file_path: str) -> int:
+        self._init_client()
+        with self.client.read(file_path) as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                return sum(1 for _ in gz)
+
+    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
+        self._init_client()
+        # Read in batches and then process; drawback: the connection may drop
+        with self.client.read(file_path) as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                while True:
+                    batch = []
+                    for _ in range(self.batch_size):
+                        try:
+                            line = next(gz)
+                            if line.strip():
+                                batch.append(line.decode("utf-8"))
+                        except StopIteration:
+                            break
+                    if not batch:
+                        break
+                    yield batch
+
+    def read_all(self, file_path: str) -> List[List[str]]:
+        self._init_client()
+        # Read everything at once and batch afterwards; drawback: memory usage
+        with self.client.read(file_path) as reader:
+            compressed_data = reader.read()
+            with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
+                content = gz_file.read().decode("utf-8")
+                lines = [i for i in content.split("\n") if i.strip()]
+                return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+
+
+class HDFSParquetFileReader(BaseFileReader):
+    def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
+        super().__init__(batch_size)
+        self.client = InsecureClient(hdfs_url, user=hdfs_user)
+
+    def list_files(self, path: str) -> List[str]:
+        return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".parquet")]
+
+    def count_lines(self, file_path: str) -> int:
+        with self.client.read(file_path) as f:
+            data = f.read()
+            df = pd.read_parquet(io.BytesIO(data))
+            count = len(df)
+            return count
+
+    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
+        # Read in batches and then process; drawback: the connection may drop
+        with self.client.read(file_path) as f:
+            data = f.read()
+            df = pd.read_parquet(io.BytesIO(data))
+            records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
+            for i in range(0, len(records), self.batch_size):
+                yield records[i: i + self.batch_size]
+
+    def read_all(self, file_path: str) -> List[List[str]]:
+        # Read everything at once and batch afterwards; drawback: memory usage
+        with self.client.read(file_path) as f:
+            data = f.read()
+            df = pd.read_parquet(io.BytesIO(data))
+            records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
+            return [records[i: i + self.batch_size] for i in range(0, len(records), self.batch_size)]
+
+
+class LocalGZFileReader(BaseFileReader):
+    def list_files(self, path: str) -> List[str]:
+        return [str(p) for p in Path(path).rglob("*.gz")]
+
+    def count_lines(self, file_path: str) -> int:
+        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
+            return sum(1 for _ in f)
+
+    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
+        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
+            while True:
+                batch = []
+                for _ in range(self.batch_size):
+                    line = f.readline()
+                    if not line:
+                        break
+                    if line.strip():
+                        batch.append(line.strip())
+                if not batch:
+                    break
+                yield batch
+
+    def read_all(self, file_path: str) -> List[List[str]]:
+        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
+            lines = [line.strip() for line in f if line.strip()]
+            return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+
+
+class LocalFileReader(BaseFileReader):
+    def list_files(self, path: str) -> List[str]:
+        return [str(p) for p in Path(path).rglob("*") if p.is_file()]
+
+    def count_lines(self, file_path: str) -> int:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return sum(1 for _ in f)
+
+    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            while True:
+                batch = []
+                for _ in range(self.batch_size):
+                    line = f.readline()
+                    if not line:
+                        break
+                    if line.strip():
+                        batch.append(line.strip())
+                if not batch:
+                    break
+                yield batch
+
+    def read_all(self, file_path: str) -> List[List[str]]:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            lines = [line.strip() for line in f if line.strip()]
+            return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
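
A small usage sketch for the local readers; the directory is a placeholder, and the HDFS-backed readers are driven the same way apart from their hdfs_url/hdfs_user constructor arguments.

from re_common.v2.baselibrary.tools.data_processer.data_reader import LocalFileReader

reader = LocalFileReader(batch_size=500)
for path in reader.list_files("/data/plain"):        # placeholder directory
    print(path, reader.count_lines(path), "lines")
    for batch in reader.read_lines(path):            # streams batch_size lines at a time
        for line in batch:
            pass  # each line is already stripped of surrounding whitespace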

re_common/v2/baselibrary/tools/data_processer/data_writer.py
@@ -0,0 +1,38 @@
+import gzip
+from io import BytesIO
+from typing import List
+
+from hdfs import InsecureClient
+
+from re_common.v2.baselibrary.tools.data_processer.base import BaseFileWriter
+
+
+class HDFSFileWriter(BaseFileWriter):
+    def __init__(self, file_path: str, hdfs_url: str, hdfs_user: str, *args, **kwargs):
+        super().__init__(file_path, *args, **kwargs)
+        self.client = InsecureClient(hdfs_url, user=hdfs_user)
+
+    def write_lines(self, lines: List[str], file_path: str = None):
+        if file_path is None:
+            file_path = self.file_path
+        data = "\n".join(lines).encode(self.encoding)
+        if self.compress:
+            buf = BytesIO()
+            with gzip.GzipFile(fileobj=buf, mode="wb") as gz:
+                gz.write(data)
+            buf.seek(0)
+            self.client.write(file_path, data=buf, overwrite=self.overwrite)
+        else:
+            self.client.write(file_path, data=data, overwrite=self.overwrite)
+
+
+class LocalFileWriter(BaseFileWriter):
+    def write_lines(self, lines: List[str], file_path: str, compress: bool = True, encoding="utf-8"):
+        if compress:
+            with gzip.open(file_path, 'wt', encoding=encoding) as f:
+                for line in lines:
+                    f.write(f"{line}\n")
+        else:
+            with open(file_path, 'w', encoding=encoding) as f:
+                for line in lines:
+                    f.write(f"{line}\n")
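
A usage sketch for LocalFileWriter; note that, unlike HDFSFileWriter, it takes compress and encoding per call rather than from the constructor. The output paths are placeholders and their parent directory must already exist.

from re_common.v2.baselibrary.tools.data_processer.data_writer import LocalFileWriter

writer = LocalFileWriter("out/part-00000.gz")
writer.write_lines(["line one", "line two"], "out/part-00000.gz", compress=True)

# Uncompressed variant of the same call:
writer.write_lines(["line one", "line two"], "out/part-00000.txt", compress=False)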

re_common/v2/baselibrary/tools/list_tools.py
@@ -63,4 +63,8 @@ def list_to_dict(list_data,key_name):
 
     # Convert the defaultdict back into a plain dict
     dict_data = dict(dict_data)
-    return dict_data
+    return dict_data
+
+def split_list_by_step(lst, step=100):
+    # Convert a flat list into a list of chunks of the given step size
+    return [lst[i:i + step] for i in range(0, len(lst), step)]
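
A worked example of the new helper, assuming this hunk belongs to list_tools.py (the +5/-1 entry in the file list above):

from re_common.v2.baselibrary.tools.list_tools import split_list_by_step

chunks = split_list_by_step(list(range(7)), step=3)
print(chunks)  # [[0, 1, 2], [3, 4, 5], [6]]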

re_common/v2/baselibrary/utils/api_net_utils.py
@@ -0,0 +1,270 @@
+import atexit
+import sys
+import asyncio
+import aiohttp
+from typing import Optional
+
+from tenacity import retry, stop_after_attempt, wait_random
+
+g_headers = {
+    'accept': 'application/json',
+    'Content-Type': 'application/json',
+    'Authorization': 'Bearer eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJ1c2VyX2lkIjotMSwidXNlcl9uYW1lIjoiXHU1ZTk0XHU3NTI4XHU0ZTJkXHU1ZmMzQ2xpZW50In0.'
+}
+
+"""
+cls._conn = aiohttp.TCPConnector(
+    limit=50,  # maximum number of connections
+    ssl=False,  # disable SSL verification (enable as needed)
+    force_close=True,  # keep the connection alive
+    enable_cleanup_closed=True  # automatically clean up closed connections
+)
+# "A duplicate name exists on the network, so no connection was made. If joining a domain, go to System in Control Panel to change the computer name and try again. If joining a workgroup, choose another workgroup name."
+That error may be caused by
+force_close=True,  # keep the connection alive
+enable_cleanup_closed=True  # automatically clean up closed connections
+being set.
+"""
+
+
+class HttpError(Exception):
+    code = 0
+    message = ""
+    headers = None
+
+    def __init__(
+            self,
+            *,
+            code: Optional[int] = None,
+            message: str = "",
+            headers: Optional[dict] = None,
+    ) -> None:
+        if code is not None:
+            self.code = code
+        self.headers = headers
+        self.message = message
+
+    def __str__(self) -> str:
+        return f"code: {self.code}, message:{self.message}"
+
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__}: code={self.code}, message={self.message!r}>"
+
+
+def on_retry_error(retry_state):
+    # Called after the final attempt fails
+    original_exc = retry_state.outcome.exception()
+    print(f"[HTTP request failed after all retries.] Error message: {original_exc}")
+
+    raise HttpError(code=getattr(original_exc, 'code', 455),
+                    message=f"Error message: {str(original_exc)}") from original_exc
+
+
+def on_retry(retry_state):
+    # Called on every failed attempt; prints a message
+    print(
+        f"[HTTP request retry]"
+        f"current attempt: {retry_state.attempt_number}"
+        f"sleep time: {retry_state.next_action.sleep:.2f} s"
+        f"\nexception: {retry_state.outcome.exception()}"
+    )
+
+
+class ApiNetUtils:
+    """
+    Async HTTP utility class providing GET/POST/PATCH request methods.
+    Features:
+    1. Automatically reuses a TCP connection pool
+    2. Automatic retries (via the async_retry decorator)
+    3. Resources are cleaned up automatically on process exit
+    4. Thread-safe lazy initialization
+    """
+
+    # Class attributes use Optional annotations and start as None for lazy initialization
+    _conn: Optional[aiohttp.TCPConnector] = None
+    _session: Optional[aiohttp.ClientSession] = None
+    _close_registered: bool = False  # ensure the cleanup hook is registered only once
+
+    @classmethod
+    async def _get_connector(cls) -> aiohttp.TCPConnector:
+        """
+        Get the TCP connector (lazily initialized).
+        Avoids the problem of there being no event loop at module load time.
+        """
+        if cls._conn is None or cls._conn.closed or cls.is_loop_closed(cls._session):
+            # Create the connector only on first use
+            cls._conn = aiohttp.TCPConnector(
+                limit=50,  # maximum number of connections
+                ssl=False,  # disable SSL verification (enable as needed)
+                force_close=True,  # keep the connection alive
+                # enable_cleanup_closed=True,  # automatically clean up closed connections
+                # keepalive_timeout=4.99  # a little below the server's 5s
+            )
+        return cls._conn
+
+    @classmethod
+    async def _get_session(cls) -> aiohttp.ClientSession:
+        """
+        Get the shared session (thread-safe lazy initialization).
+        Also registers the automatic cleanup hook.
+        """
+        if cls._session is None or cls._session.closed or cls.is_loop_closed(cls._session):
+            if cls._session:
+                await cls.close()
+            # Get the connector (initialized automatically)
+            connector = await cls._get_connector()
+
+            # Force-fetch the current event loop
+            loop = asyncio.get_event_loop()
+
+            # Create a new session
+            cls._session = aiohttp.ClientSession(
+                connector=connector,
+                timeout=aiohttp.ClientTimeout(total=30),  # 30-second default timeout
+                loop=loop)  # explicitly bind the event loop
+
+            # Register the cleanup hook for process exit
+            cls._register_cleanup()
+
+        return cls._session
+
+    @staticmethod
+    def is_loop_closed(session: aiohttp.ClientSession) -> bool:
+        """
+        Check whether the event loop bound to the session has been closed.
+        """
+        loop = session._loop  # the event loop bound to the session
+        if loop.is_closed():
+            # print("Event loop is closed")
+            return True
+        # print("Event loop not is closed")
+        return False
+
+    @classmethod
+    def _register_cleanup(cls):
+        """
+        Register resource-cleanup hooks for process exit.
+        Covers both normal and abnormal exits.
+        """
+        if not cls._close_registered:
+            # 1. Normal exit
+            atexit.register(lambda: asyncio.run(cls.close()))
+
+            # 2. Abnormal exit
+            original_excepthook = sys.excepthook
+
+            def custom_excepthook(exctype, value, traceback):
+                """Custom exception hook that makes sure resources get cleaned up."""
+                # Run the original handler first (prints the stack, etc.)
+                original_excepthook(exctype, value, traceback)
+                # Then clean up resources
+                try:
+                    asyncio.run(cls.close())
+                except RuntimeError:
+                    # If there is no event loop any more, run the cleanup synchronously
+                    loop = asyncio.new_event_loop()
+                    loop.run_until_complete(cls.close())
+                    loop.close()
+
+            sys.excepthook = custom_excepthook
+            cls._close_registered = True
+
+    @classmethod
+    async def close(cls):
+        """
+        Safely close all network resources.
+        Called automatically on program exit, and can also be called manually.
+        """
+        if cls._session and not cls._session.closed:
+            await cls._session.close()
+            cls._session = None
+
+        if cls._conn and not cls._conn.closed:
+            await cls._conn.close()
+            cls._conn = None
+
+        # print("[ApiNetUtils] network resources released safely")
+
+    # -------------------- Public API methods -------------------- #
+
+    @classmethod
+    @retry(stop=stop_after_attempt(4),  # effectively 4 attempts, i.e. 3 retries
+           wait=wait_random(min=5, max=15),
+           before_sleep=on_retry,  # called after each failed attempt
+           retry_error_callback=on_retry_error,
+           reraise=True)
+    async def fetch_get(cls, url: str, headers=None, params=None):
+        """
+        GET request wrapper.
+        :param url: request URL
+        :param headers: optional request headers (global g_headers by default)
+        :param params: query parameters (dict)
+        :return: parsed JSON data
+        :raises HttpError: raised when the status code is not 200
+        """
+        headers = headers or g_headers
+        session = await cls._get_session()
+
+        async with session.get(url, headers=headers, params=params) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                raise HttpError(
+                    code=response.status,
+                    message=f"Request failed: url={url}, status={response.status}, details={error_text}"
+                )
+            return await response.json()
+
+    @classmethod
+    @retry(stop=stop_after_attempt(4),
+           wait=wait_random(min=5, max=15),
+           before_sleep=on_retry,  # called after each failed attempt
+           retry_error_callback=on_retry_error,
+           reraise=True)
+    async def fetch_post(cls, url: str, payload: dict, headers=None):
+        """
+        POST request wrapper (JSON body).
+        """
+        headers = headers or g_headers
+        session = await cls._get_session()
+
+        async with session.post(url, json=payload, headers=headers) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                raise HttpError(
+                    code=response.status,
+                    message=f"Request failed: url={url}, status={response.status}, details={error_text}"
+                )
+            return await response.json()
+
+    @classmethod
+    @retry(stop=stop_after_attempt(4),
+           wait=wait_random(min=5, max=15),
+           before_sleep=on_retry,  # called after each failed attempt
+           retry_error_callback=on_retry_error,
+           reraise=True)
+    async def fetch_patch(cls, url: str, payload: dict, headers=None):
+        """
+        PATCH request wrapper (JSON body).
+        """
+        headers = headers or g_headers
+        session = await cls._get_session()
+
+        async with session.patch(url, json=payload, headers=headers) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                raise HttpError(
+                    code=response.status,
+                    message=f"Request failed: url={url}, status={response.status}, details={error_text}"
+                )
+            return await response.json()
+
+    @classmethod
+    async def __aenter__(cls):
+        """Support for the async with syntax."""
+        await cls._get_session()
+        return cls
+
+    @classmethod
+    async def __aexit__(cls, exc_type, exc, tb):
+        """Closed automatically when the async with block exits."""
+        await cls.close()
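
A sketch of calling the new ApiNetUtils helpers from an asyncio entry point; the URL and payload are placeholders, and failures surface as HttpError once the retries are exhausted.

import asyncio

from re_common.v2.baselibrary.utils.api_net_utils import ApiNetUtils, HttpError


async def main():
    try:
        # fetch_get/fetch_post run up to 4 attempts before on_retry_error wraps the last exception.
        data = await ApiNetUtils.fetch_get("http://example.org/api/items", params={"page": 1})
        created = await ApiNetUtils.fetch_post("http://example.org/api/items", payload={"name": "demo"})
        print(data, created)
    except HttpError as e:
        print(f"request failed: {e}")
    finally:
        await ApiNetUtils.close()  # release the shared session/connector explicitly


asyncio.run(main())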

re_common/v2/baselibrary/utils/db.py
@@ -1,8 +1,9 @@
+import atexit
+import os
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator, Tuple
 
-import
-from aiomysql import Pool, Connection, Cursor, DictCursor
+from aiomysql import Pool, Connection, Cursor
 
 DB_CONFIG = {
     'host': '192.168.98.55',
@@ -18,10 +19,27 @@ DB_CONFIG = {
     'echo': False,  # print SQL statements
 }
 
+DB_CONFIG1 = {
+    'host': '192.168.98.55',
+    'port': 4000,
+    'user': 'foreign_fulltextUser',
+    'password': 'i4hIeasw1qpmhGN2nwL7',
+    'db': 'foreign_fulltext',
+    'charset': 'utf8mb4',
+    'minsize': 16,  # minimum pool size
+    'maxsize': 128,  # maximum pool size
+    'autocommit': False,  # autocommit transactions
+    'pool_recycle': 3600,  # recycle time (seconds) per connection; after this it is closed and recreated to avoid stale connections
+    'echo': False,  # print SQL statements
+}
+
 
 @asynccontextmanager
-async def get_db_pool():
+async def get_db_pool(_DB_CONFIG: dict = None):
     """Async database connection pool helper."""
+    global DB_CONFIG
+    if _DB_CONFIG is not None:
+        DB_CONFIG = _DB_CONFIG
     pool: Pool = await aiomysql.create_pool(**DB_CONFIG)
     try:
         yield pool
@@ -36,3 +54,21 @@ async def get_session(pool: Pool) -> AsyncGenerator[Tuple[Connection, Cursor], N
     async with pool.acquire() as conn:
         async with conn.cursor() as cursor:
             yield conn, cursor
+
+
+# main.py
+import aiomysql
+import asyncio
+
+aiomysql_pool = None
+pool_lock = asyncio.Lock()  # global async lock
+
+
+async def init_aiomysql_pool_async():
+    global aiomysql_pool
+    if aiomysql_pool is None:
+        async with pool_lock:
+            if aiomysql_pool is None:
+                print(f"[{os.getpid()}] Initializing aiomysql pool...")
+                aiomysql_pool = await aiomysql.create_pool(**DB_CONFIG)
+    return aiomysql_pool
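
A sketch of the intended call pattern for the pool helpers; it assumes get_session carries the same @asynccontextmanager decorator as get_db_pool (the decorator itself lies outside this hunk) and that a reachable database matches DB_CONFIG.

import asyncio

from re_common.v2.baselibrary.utils.db import get_db_pool, get_session


async def main():
    # DB_CONFIG in db.py is used by default; pass a dict to get_db_pool() to override it.
    async with get_db_pool() as pool:
        async with get_session(pool) as (conn, cursor):
            await cursor.execute("SELECT 1")
            print(await cursor.fetchone())


asyncio.run(main())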

re_common/v2/baselibrary/utils/string_clear.py
@@ -9,7 +9,7 @@ from re_common.v2.baselibrary.utils.stringutils import (
     bj2qj,
     get_diacritic_variant,
     clean_html,
-    remove_spaces_between_chinese_characters,
+    remove_spaces_between_chinese_characters, clean_unicode_alnum,
 )
 
 
@@ -91,6 +91,11 @@ class StringClear(object):
         self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
         return self
 
+    def remove_all_symbols(self):
+        # A more aggressive symbol cleanup: keep only letters and digits from any language
+        self.obj_str = clean_unicode_alnum(self.obj_str)
+        return self
+
     def remove_underline(self):
         # The underscore is part of \w, so it gets its own method
         self.obj_str = re.sub("[_]", "", self.obj_str)

re_common/v2/baselibrary/utils/stringutils.py
@@ -211,3 +211,18 @@ def get_group_abstract(lists):
             t_list.append(keyid_list[text_idx])
         all_list.append(t_list)
     return all_list
+
+
+def clean_unicode_alnum(text: str) -> str:
+    """
+    Remove every character that is not a Unicode letter or digit.
+
+    Args:
+        text (str): input text.
+
+    Returns:
+        str: text containing only Unicode letters and digits.
+    \p{N} matches every Unicode digit character, including non-Arabic digits
+    \p{L} matches letters from every language
+    """
+    return regex.sub(r"[^\p{L}\p{N}]+", "", text)
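
A quick illustration of what the new helper keeps and drops (letters and digits from any script survive, everything else is removed):

from re_common.v2.baselibrary.utils.stringutils import clean_unicode_alnum

print(clean_unicode_alnum("Hello, 世界! No. 42"))  # -> Hello世界No42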

{re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/RECORD
@@ -163,7 +163,7 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
 re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
 re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=
+re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
 re_common/v2/baselibrary/business_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/business_utils/rel_tools.py,sha256=LfnGFCkUSxg1SHvOMOQdP1PiHxIKqk7Syuk5YYpjJag,295
 re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -174,29 +174,35 @@ re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLb
 re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=sKBt2gPsfj0gzV6KaLSAhIhL-j3qNfHfqE-lII1LVwM,3537
 re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
-re_common/v2/baselibrary/tools/dict_tools.py,sha256=
+re_common/v2/baselibrary/tools/dict_tools.py,sha256=eSMwPTLp3oSjuviC_wlXg0I-dnkkmZfUfCRLX5djWV8,1365
 re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
 re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=g0DaNjXM1hIUblFQ6YBwnwEBKIXn48X8Y9Eiok4dVlQ,14824
-re_common/v2/baselibrary/tools/list_tools.py,sha256=
+re_common/v2/baselibrary/tools/list_tools.py,sha256=1NxGVM4EytSXh4IGAEfZQnvq0Ev-UOF-PGZBg2EQbOg,2132
 re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
 re_common/v2/baselibrary/tools/text_matcher.py,sha256=cPMoFxaA0-ce3tLRxVSs8_3pTYS1oVIHDnNy_AlPU-4,10756
 re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
+re_common/v2/baselibrary/tools/data_processer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+re_common/v2/baselibrary/tools/data_processer/base.py,sha256=i6HA2UQsSRZaKxW1wJMpiC9LAy3wYaI2BVxUAiFoRZ4,1704
+re_common/v2/baselibrary/tools/data_processer/data_processer.py,sha256=R7zHQG8eo3mfckYr-Pp53fyyQj6zd8fuweSxwzvDgN0,22683
+re_common/v2/baselibrary/tools/data_processer/data_reader.py,sha256=LWLbom7W2L0T6q38crA1_Gcvxkzk9Lm0btJjrmtMHMU,7945
+re_common/v2/baselibrary/tools/data_processer/data_writer.py,sha256=OgKZ06zRJYNx758rbjxZG_KNgkLuVLlyB1AvyRsJtS4,1447
 re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
 re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+re_common/v2/baselibrary/utils/api_net_utils.py,sha256=22q3WMWiKVg1IVGr4y2D1JrjhnbQtlChRDJm2S8rGlc,9868
 re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
 re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
 re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
 re_common/v2/baselibrary/utils/basehdfs.py,sha256=TPwFct_-UrmO1KCbo4gpV77rsnlCQDumNBbQKL0ZI9o,5953
 re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
 re_common/v2/baselibrary/utils/basetime.py,sha256=b7U_ho6nE3fjYBxSkdMHXUOd3ClH6KkW_7p7l2Gs4gA,3038
-re_common/v2/baselibrary/utils/db.py,sha256=
+re_common/v2/baselibrary/utils/db.py,sha256=ouDagXqqY9h4ucK4LDGrYVY-31rOiBQFxXLIlio9AJA,2297
 re_common/v2/baselibrary/utils/json_cls.py,sha256=M93piYtmgm_wP8E57culTrd_AhHLoGg6PqeAJYdW2SM,438
 re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
 re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
 re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
-re_common/v2/baselibrary/utils/string_clear.py,sha256=
+re_common/v2/baselibrary/utils/string_clear.py,sha256=ywYR1KrKQyeM-zJgvTmORlfgbLdRSjWWKPe7K8oRx_8,7450
 re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
-re_common/v2/baselibrary/utils/stringutils.py,sha256=
+re_common/v2/baselibrary/utils/stringutils.py,sha256=TI6fw3km1l25ufXrnG6ha8dSBDtRh-MF4nWRt9u8Xbo,6452
 re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
 re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -223,8 +229,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
 re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
 re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
 re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
+re_common-10.0.26.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+re_common-10.0.26.dist-info/METADATA,sha256=kHLVPF-e0PjpnUL7dN9pAMqK_pw4yHwZGKxbJ_zlAY0,582
+re_common-10.0.26.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+re_common-10.0.26.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
+re_common-10.0.26.dist-info/RECORD,,

{re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/LICENSE: File without changes
{re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/WHEEL: File without changes
{re_common-10.0.24.dist-info → re_common-10.0.26.dist-info}/top_level.txt: File without changes