re-common 10.0.13__py3-none-any.whl → 10.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/decorators/__init__.py +0 -0
- re_common/v2/baselibrary/decorators/utils.py +59 -0
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +76 -0
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +318 -0
- re_common/v2/baselibrary/tools/search_hash_tools.py +4 -3
- re_common/v2/baselibrary/tools/text_matcher.py +131 -28
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +21 -7
- re_common/v2/baselibrary/utils/basepika.py +180 -0
- re_common/v2/baselibrary/utils/db.py +38 -0
- re_common/v2/baselibrary/utils/mq.py +83 -0
- re_common/v2/baselibrary/utils/string_bool.py +13 -1
- re_common/v2/baselibrary/utils/string_clear.py +9 -0
- {re_common-10.0.13.dist-info → re_common-10.0.15.dist-info}/METADATA +1 -1
- {re_common-10.0.13.dist-info → re_common-10.0.15.dist-info}/RECORD +17 -10
- {re_common-10.0.13.dist-info → re_common-10.0.15.dist-info}/LICENSE +0 -0
- {re_common-10.0.13.dist-info → re_common-10.0.15.dist-info}/WHEEL +0 -0
- {re_common-10.0.13.dist-info → re_common-10.0.15.dist-info}/top_level.txt +0 -0
|
File without changes
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
import functools
|
|
3
|
+
|
|
4
|
+
# 全局集合,用于记录已警告的函数或类
|
|
5
|
+
_warned_once = set()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def deprecated(message=None):
    """
    Decorator factory that marks a function or a class as deprecated.

    A ``DeprecationWarning`` is emitted the first time the decorated function
    is called (or the decorated class is instantiated); later uses of the same
    object stay silent for the rest of the process.

    Args:
        message (str): optional extra text appended to the default warning.

    Raises:
        TypeError: when the decorated object is neither a plain function nor a class.
    """
    function_type = type(lambda: None)

    def _emit(base_text):
        # Append the caller-supplied hint, if any, to the default text.
        text = f"{base_text} {message}" if message else base_text
        # stacklevel=3 skips this helper and the wrapper that called it, so
        # the warning is attributed to the user's call site.
        warnings.warn(text, category=DeprecationWarning, stacklevel=3)

    def decorator(obj):
        if isinstance(obj, type):
            # Class: wrap __init__ so that instantiation triggers the warning.
            original_init = obj.__init__

            @functools.wraps(original_init)
            def new_init(self, *args, **kwargs):
                key = id(obj)
                if key not in _warned_once:
                    _emit(f"类 {obj.__name__} 已不建议使用。")
                    _warned_once.add(key)  # remember that this class has warned
                original_init(self, *args, **kwargs)

            obj.__init__ = new_init
            return obj

        if isinstance(obj, function_type):
            # Plain function: wrap the call itself.
            @functools.wraps(obj)
            def wrapper(*args, **kwargs):
                key = id(obj)  # identity of the wrapped function is the dedup key
                if key not in _warned_once:
                    _emit(f"函数 {obj.__name__} 已不建议使用。")
                    _warned_once.add(key)
                return obj(*args, **kwargs)

            return wrapper

        raise TypeError("此装饰器仅适用于函数和类")

    return decorator
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
|
|
3
|
+
import ahocorasick
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ACTool(object):
    """Small convenience wrapper around a pyahocorasick ``Automaton``.

    Add keywords with :meth:`add_word`, call :meth:`make_automaton`, then
    search text with :meth:`iter`.
    """

    def __init__(self):
        self.automaton = ahocorasick.Automaton()

    def add_word(self, key, value, overwrite=True) -> bool:
        """
        Register ``key`` with its associated ``value``.

        :param key: keyword to add
        :param value: value stored for the keyword
        :param overwrite: replace the value of an existing key (default True)
        :return: True when the key was added or overwritten, False when an
                 existing key was left untouched because ``overwrite`` is falsy
        """
        # Only the "keep the existing entry" case skips the insertion.
        if not overwrite and key in self.automaton:
            return False
        self.automaton.add_word(key, value)
        return True

    def is_exists_key(self, key) -> bool:
        """Return True when ``key`` has been added to the automaton."""
        return bool(self.automaton.exists(key))

    def make_automaton(self):
        """
        Finalise the automaton; must be called after the words are added.
        """
        self.automaton.make_automaton()

    def iter(self, key):
        """
        Search ``key`` (the text to scan) and return an iterable of
        ``(end_index, value)`` pairs; wrap it in ``list`` to materialise it.

            tool.add_word("he", "word1")
            tool.add_word("hello", "word2")

            input_string = "hello world"
            matches = list(tool.automaton.iter(input_string))
            print(matches)  # [(1, 'word1'), (4, 'word2')]

        ``(1, 'word1')``: the keyword "he" ends at index 1 of "hello world"
        (the position of its last character 'e'); its stored value is "word1".
        Indexes: h(0)e(1)l(2)l(3)o(4) (5)w(6)o(7)r(8)l(9)d(10).
        ``(4, 'word2')``: the keyword "hello" ends at index 4 (its last
        character 'o'); its stored value is "word2".

        Note: only the value is reported, never the key. Store the key inside
        the value if it is needed in the results.
        """
        return self.automaton.iter(key)  # ahocorasick.AutomatonSearchIter

    def save(self, local_temp_path):
        """
        Persist the built automaton to ``local_temp_path`` (values are pickled).
        """
        self.automaton.save(local_temp_path, pickle.dumps)

    def load(self, local_temp_path):
        """
        Replace the current automaton with one previously written by :meth:`save`.
        """
        # NOTE(review): values are restored with pickle.loads - only load
        # files from a trusted source.
        self.automaton = ahocorasick.load(local_temp_path, pickle.loads)
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import gzip
|
|
3
|
+
import json
|
|
4
|
+
import sqlite3
|
|
5
|
+
import time
|
|
6
|
+
import os
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Callable, Any, List
|
|
9
|
+
|
|
10
|
+
from hdfs import InsecureClient
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HDFSDataProcessor:
    """Feed gzip-compressed JSON-lines files stored on HDFS to an async callback.

    Files that were processed completely are recorded in a small SQLite
    database (``db_file``) so that an interrupted run can resume; the database
    is removed once every file of a directory has been processed successfully.
    """

    def __init__(
            self,
            hdfs_url="http://VIP-DC-MASTER-2:9870",
            hdfs_user="root",
            db_file="processed_files.db",
            batch_size=50,
            retry_limit=3,
    ):
        """
        :param hdfs_url: WebHDFS endpoint passed to ``InsecureClient``
        :param hdfs_user: HDFS user name
        :param db_file: path of the SQLite checkpoint database
        :param batch_size: number of lines per batch (and tasks per gather in bulk mode)
        :param retry_limit: maximum attempts per record / per file
        """
        self.hdfs_url = hdfs_url
        self.hdfs_user = hdfs_user
        self.db_file = db_file
        self.batch_size = batch_size
        self.retry_limit = retry_limit
        self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
        # Reader strategies: "all" loads the whole file, "batch" streams it.
        self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
        self.read_hdfs_model = "all"
        self.init_db()

    # ------------------------------------------------------------------ #
    # checkpoint database
    # ------------------------------------------------------------------ #
    def _open_db(self):
        """Return a context manager yielding a SQLite connection that is closed on exit."""
        # sqlite3's own ``with conn:`` only commits/rolls back, it never
        # closes the connection, hence the explicit ``closing`` wrapper.
        from contextlib import closing

        return closing(sqlite3.connect(self.db_file))

    def init_db(self):
        """Create the checkpoint table if it does not exist yet."""
        with self._open_db() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS processed_files (
                    file_path TEXT PRIMARY KEY
                )
            """)
            conn.commit()

    def save_processed_file(self, file_path):
        """Record ``file_path`` as fully processed (idempotent)."""
        with self._open_db() as conn:
            conn.execute("INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)", (file_path,))
            conn.commit()

    def is_file_processed(self, file_path):
        """Return True when ``file_path`` is recorded in the checkpoint table."""
        with self._open_db() as conn:
            row = conn.execute(
                "SELECT file_path FROM processed_files WHERE file_path = ?", (file_path,)
            ).fetchone()
        return row is not None

    def _remove_checkpoint_db(self):
        """Delete the checkpoint database; failures are reported, not raised."""
        try:
            if os.path.exists(self.db_file):
                os.remove(self.db_file)
                print(f"已删除断点重试文件: {self.db_file}")
        except OSError as e:
            print(f"删除断点重试文件失败: {e}")

    # ------------------------------------------------------------------ #
    # reading
    # ------------------------------------------------------------------ #
    def list_gz_files(self, hdfs_dir):
        """List the full paths of all ``.gz`` entries directly under ``hdfs_dir``."""
        return [
            f"{hdfs_dir}/{entry[0]}"
            for entry in self.client.list(hdfs_dir, status=True)
            if entry[0].endswith(".gz")
        ]

    def count_total_lines(self, gz_file_path: str):
        """Count every line (blank ones included) of the decompressed file."""
        with self.client.read(gz_file_path) as hdfs_file:
            with gzip.GzipFile(fileobj=hdfs_file) as gz:
                return sum(1 for _ in gz)

    def batch_read_gz(self, gz_file_path: str):
        """Stream the file, yielding lists of decoded non-blank lines.

        Each batch is cut from ``batch_size`` raw lines; blank lines are
        dropped after the cut, so a batch may be shorter than ``batch_size``.
        """
        from itertools import islice

        with self.client.read(gz_file_path) as hdfs_file:
            with gzip.GzipFile(fileobj=hdfs_file) as gz:
                while True:
                    raw_lines = list(islice(gz, self.batch_size))
                    if not raw_lines:  # true end of file (or batch_size == 0)
                        break
                    lines = [raw.decode("utf-8") for raw in raw_lines if raw.strip()]
                    # A window made only of blank lines is skipped instead of
                    # being mistaken for end of file.
                    if lines:
                        yield lines

    def all_read_gz(self, gz_file_path: str, encoding='utf-8'):
        """
        Download and decompress the whole file, then split it into batches.

        :param gz_file_path: HDFS path of the gzip file
        :param encoding: text encoding of the decompressed content (default utf-8)
        :return: list of batches, each a list of at most ``batch_size`` non-blank lines
        """
        with self.client.read(gz_file_path) as reader:  # binary stream
            compressed_data = reader.read()
        with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
            content = gz_file.read().decode(encoding)
        print(f"文件读取成功: {gz_file_path}")
        lines = [row for row in content.splitlines() if row.strip()]
        return [lines[i:i + self.batch_size] for i in range(0, len(lines), self.batch_size)]

    # ------------------------------------------------------------------ #
    # progress reporting
    # ------------------------------------------------------------------ #
    def _print_progress(self, hdfs_file_path, processed_lines, total_lines, start_time, processing_start_time):
        """Print running totals, per-record average and a remaining-time estimate."""
        elapsed_time = time.time() - start_time
        processing_time = time.time() - processing_start_time
        if processed_lines > 0:
            avg_processing_time = (elapsed_time * 1000) / processed_lines  # ms per record
            remaining_time = (avg_processing_time / 1000) * (total_lines - processed_lines)
        else:
            avg_processing_time = float("inf")
            remaining_time = float("inf")
        print(
            f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
            f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
            f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
        )

    def _print_summary(self, hdfs_file_path, processed_lines, total_lines, start_time):
        """Print the final per-file summary line."""
        final_elapsed_time = time.time() - start_time
        print(
            f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
            f"总已用时间: {final_elapsed_time:.2f}秒 | "
            f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
            if processed_lines > 0
            else "处理无数据"
        )

    # ------------------------------------------------------------------ #
    # per-record mode
    # ------------------------------------------------------------------ #
    async def process_data(self, data, process_func):
        """Await ``process_func(data)``, retrying with exponential back-off.

        After ``retry_limit`` failures the record is reported and dropped;
        no exception is propagated.
        """
        retry_count = 0
        while retry_count < self.retry_limit:
            try:
                await process_func(data)
                return  # success
            except Exception as e:
                retry_count += 1
                print(f"处理数据时发生错误: {e}, 正在重试 {retry_count}/{self.retry_limit}, data: {data}")
                await asyncio.sleep(2 ** retry_count)
        print(f"处理数据失败, 达到重试上限, data: {data}")

    async def process_file(self, hdfs_file_path, process_func):
        """Process one gz file, running one task per JSON line, batch by batch."""
        total_lines = self.count_total_lines(hdfs_file_path)
        processed_lines = 0
        start_time = time.time()
        # The reader is selected by ``read_hdfs_model``.
        for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
            processing_start_time = time.time()

            tasks = []
            for line in lines:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"解析JSON失败: {e}, 行内容: {line.strip()}")
                    continue
                tasks.append(self.process_data(data, process_func))

            await asyncio.gather(*tasks)
            processed_lines += len(lines)
            self._print_progress(hdfs_file_path, processed_lines, total_lines, start_time, processing_start_time)

        self._print_summary(hdfs_file_path, processed_lines, total_lines, start_time)
        self.save_processed_file(hdfs_file_path)

    async def _retry_file(self, worker, hdfs_file_path, process_func):
        """Run ``worker(hdfs_file_path, process_func)`` up to ``retry_limit`` times."""
        retry_count = 0
        while retry_count < self.retry_limit:
            try:
                await worker(hdfs_file_path, process_func)
                return True
            except Exception as e:
                retry_count += 1
                print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
                await asyncio.sleep(2 ** retry_count)
        print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
        return False

    async def retry_process_file(self, hdfs_file_path, process_func):
        """:meth:`process_file` with retries; returns True on success, False otherwise."""
        return await self._retry_file(self.process_file, hdfs_file_path, process_func)

    async def _process_directory(self, hdfs_dir, process_func, retry_worker):
        """Process every unprocessed gz file of ``hdfs_dir`` sequentially."""
        all_succeed = True
        for hdfs_file_path in self.list_gz_files(hdfs_dir):
            if self.is_file_processed(hdfs_file_path):
                print(f"跳过已处理文件: {hdfs_file_path}")
                continue
            succeed = await retry_worker(hdfs_file_path, process_func)
            if succeed is False:
                all_succeed = False

        # The checkpoint is only useful while something is left to resume.
        if all_succeed:
            self._remove_checkpoint_db()

    async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any]):
        """Process all gz files of ``hdfs_dir``; ``process_func`` receives one record at a time."""
        await self._process_directory(hdfs_dir, process_func, self.retry_process_file)

    # ------------------------------------------------------------------ #
    # bulk mode
    # ------------------------------------------------------------------ #
    async def process_file_bulk(self, hdfs_file_path, process_func):
        """Process one gz file, handing each parsed batch (a list) to ``process_func``."""
        total_lines = self.count_total_lines(hdfs_file_path)
        processed_lines = 0
        start_time = time.time()

        tasks = []
        for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
            processing_start_time = time.time()

            batch_data = []
            for line in lines:
                try:
                    batch_data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"解析JSON失败: {e}, 行内容: {line.strip()}")

            if batch_data:
                tasks.append(process_func(batch_data))  # collected, awaited below
                processed_lines += len(batch_data)

            # Once batch_size batches are queued, run them concurrently.
            if len(tasks) >= self.batch_size:
                await asyncio.gather(*tasks)
                self._print_progress(hdfs_file_path, processed_lines, total_lines, start_time, processing_start_time)
                tasks.clear()

        # Batches left over below the batch_size threshold.
        if tasks:
            await asyncio.gather(*tasks)

        self._print_summary(hdfs_file_path, processed_lines, total_lines, start_time)
        self.save_processed_file(hdfs_file_path)

    async def retry_process_file_bulk(self, hdfs_file_path, process_func):
        """:meth:`process_file_bulk` with retries; returns True on success, False otherwise."""
        return await self._retry_file(self.process_file_bulk, hdfs_file_path, process_func)

    async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
        """Process all gz files of ``hdfs_dir``; ``process_func`` receives a list of records."""
        await self._process_directory(hdfs_dir, process_func, self.retry_process_file_bulk)
|
|
303
|
+
|
|
304
|
+
# # 使用示例
|
|
305
|
+
# async def update_refer(data: dict):
|
|
306
|
+
# ref_id = data["ref_id"]
|
|
307
|
+
# url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
|
|
308
|
+
# update_data = data["update_data"]
|
|
309
|
+
# if not update_data:
|
|
310
|
+
# return
|
|
311
|
+
#
|
|
312
|
+
# # 此处为实际处理逻辑
|
|
313
|
+
# await ApiNetUtils.fetch_post(url=url, payload=update_data)
|
|
314
|
+
#
|
|
315
|
+
#
|
|
316
|
+
# if __name__ == "__main__":
|
|
317
|
+
# processor = HDFSDataProcessor() # 实例化数据处理类
|
|
318
|
+
# asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
|
|
@@ -3,9 +3,10 @@ from typing import List
|
|
|
3
3
|
import jieba
|
|
4
4
|
from datasketch import MinHash, minhash
|
|
5
5
|
|
|
6
|
+
from re_common.v2.baselibrary.decorators.utils import deprecated
|
|
6
7
|
from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
|
|
7
8
|
|
|
8
|
-
|
|
9
|
+
@deprecated("请使用 TextMatcherV2 中的方法代替。")
|
|
9
10
|
def tokenize(text: str, stopwords=None) -> List[str]:
|
|
10
11
|
"""
|
|
11
12
|
分词并移除停用词
|
|
@@ -32,7 +33,7 @@ def tokenize(text: str, stopwords=None) -> List[str]:
|
|
|
32
33
|
words = [w for w in words if w not in stopwords and w.strip()]
|
|
33
34
|
return words
|
|
34
35
|
|
|
35
|
-
|
|
36
|
+
@deprecated("请使用 TextMatcherV2 中的方法代替。")
|
|
36
37
|
def create_minhash(words: List[str], num_perm=128) -> MinHash:
|
|
37
38
|
"""
|
|
38
39
|
为分词结果创建 MinHash
|
|
@@ -42,7 +43,7 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
|
|
|
42
43
|
minhash.update(word.encode("utf-8"))
|
|
43
44
|
return minhash
|
|
44
45
|
|
|
45
|
-
|
|
46
|
+
@deprecated("请使用 TextMatcherV2 中的方法代替。")
|
|
46
47
|
def get_str_minhash(title):
|
|
47
48
|
from re_common.v2.baselibrary.utils.string_clear import rel_clear
|
|
48
49
|
rel_title = rel_clear(title)
|
|
@@ -1,10 +1,16 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
|
|
1
3
|
import jieba
|
|
2
4
|
import re
|
|
3
|
-
from typing import List, Dict, Tuple, Set, Optional, Union
|
|
5
|
+
from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
|
|
4
6
|
from datasketch import MinHash, MinHashLSH
|
|
5
7
|
|
|
8
|
+
from re_common.v2.baselibrary.decorators.utils import deprecated
|
|
9
|
+
from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
|
|
10
|
+
|
|
6
11
|
|
|
7
|
-
|
|
12
|
+
@deprecated("请使用 TextMatcherV2 代替。")
|
|
13
|
+
class TextMatcher(object):
|
|
8
14
|
def __init__(
|
|
9
15
|
self,
|
|
10
16
|
threshold: float = 0.5,
|
|
@@ -188,36 +194,133 @@ class TextMatcher:
|
|
|
188
194
|
self.doc_counter = 0
|
|
189
195
|
|
|
190
196
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
+
# 定义一个协议,描述“像鸭子一样”的行为
|
|
198
|
+
class TokenizeDuckLike(Protocol):
    """Structural type: any object exposing ``get_words(text)`` that returns a list."""

    def get_words(self, text) -> List:
        """Split ``text`` into a list of tokens."""
        ...
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class JiebaTokenize(object):
    """Tokenizer backed by ``jieba`` with optional stop-word filtering."""

    def __init__(self, stopwords=None):
        """
        :param stopwords: collection of tokens to drop; ``None`` means no stop words
        """
        self.stopwords = stopwords

    def get_words(self, text) -> List:
        """
        Tokenise ``text`` and return the tokens that are neither stop words
        nor blank.

        When more than 60% of the jieba tokens are single non-CJK characters
        (jieba could not segment the text), the text is split on whitespace
        instead.
        """
        # Bind the local name in both cases; previously it was only assigned
        # when no stop words were configured, so a configured list raised
        # NameError in the filter below.
        stopwords = [] if self.stopwords is None else self.stopwords
        words = jieba.lcut(text)

        # Count single-character tokens to detect text jieba could not
        # segment into words.
        # A helper function is used instead of an inline comprehension
        # condition because, per the original author, some Spark setups do not
        # short-circuit the condition and evaluate it as a single expression.
        def is_singel_en(i):
            if len(i) == 1 and not is_single_cjk_char(i):
                return True
            return False

        one_char_size = len([i for i in words if is_singel_en(i)])
        all_size = len(words)
        # Too many single characters: fall back to whitespace tokenisation.
        if all_size != 0 and one_char_size / all_size > 0.6:
            words = [i for i in text.split() if i.strip()]

        # Drop stop words and blank tokens.
        words = [w for w in words if w not in stopwords and w.strip()]
        return words
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class TextMatcherV2(object):
    """MinHash + LSH near-duplicate text matcher with a pluggable tokenizer."""

    def __init__(
            self,
            threshold: float = 0.5,
            num_perm: int = 128,
            tdk: TokenizeDuckLike = None
    ):
        """
        Initialise the matcher.

        Args:
            threshold: LSH similarity threshold
            num_perm: number of MinHash permutations
            tdk: default tokenizer (any object with ``get_words(text)``),
                used whenever a plain string has to be converted to a MinHash
        """
        self.threshold = threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        self.tdk = tdk

    def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
        """Insert one document; a ``str`` is tokenised and hashed first."""
        if isinstance(minhash, str):
            minhash = self.str_to_minihash(minhash, tdk)

        self.lsh.insert(doc_id, minhash)

    def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
        """
        Insert many documents.

        ``betch_data`` is either a dict ``{doc_id: minhash_or_str}`` or a list
        of two-item tuples/lists ``(doc_id, minhash_or_str)``.

        Raises:
            Exception: when ``betch_data`` is neither a list nor a dict.
        """
        if isinstance(betch_data, list):
            pairs = betch_data  # every item must unpack into exactly two values
        elif isinstance(betch_data, dict):
            pairs = betch_data.items()
        else:
            raise Exception("数据类型错误")

        # add_document already converts strings, so the id is passed
        # explicitly and nothing is hashed twice.
        for docid, minhash_or_str in pairs:
            self.add_document(docid, minhash_or_str, tdk)

    def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
        """Return the ids of the LSH candidates similar to the query."""
        if isinstance(query_minhash, str):
            query_minhash = self.str_to_minihash(query_minhash, tdk)
        similar_docs = self.lsh.query(query_minhash)
        return similar_docs

    def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
        """
        Build a MinHash from a list of tokens; ``num_perm`` defaults to the
        matcher's own setting.
        """
        if num_perm is None:
            num_perm = self.num_perm
        minhash = MinHash(num_perm=num_perm)
        for word in words:
            minhash.update(word.encode("utf-8"))
        return minhash

    def create_words(self, text: str, tdk: TokenizeDuckLike = None):
        """Tokenise ``text`` with ``tdk``, falling back to the matcher's default tokenizer.

        Raises:
            ValueError: when neither ``tdk`` nor the default tokenizer is set.
        """
        if tdk is None:
            tdk = self.tdk
        if tdk is None:
            raise ValueError("no tokenizer available: pass tdk or set it on the matcher")
        return tdk.get_words(text)

    def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
        """Tokenise ``text`` and return its MinHash (``num_perm`` taken from the matcher)."""
        words = self.create_words(text, tdk)
        return self.create_minhash(words, self.num_perm)

    def minhash_dumps(self, minhash) -> bytes:
        """
        Serialise a MinHash with pickle.
        """
        return pickle.dumps(minhash)

    def minhash_loads(self, serialized_minhash) -> MinHash:
        """
        Deserialise a MinHash produced by :meth:`minhash_dumps`.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # only feed it bytes from a trusted source.
        return pickle.loads(serialized_minhash)

    def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
        """
        Merge an LSH index that was built elsewhere into this matcher's index.
        """
        self.lsh.merge(other_minhashlsh)
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
|
|
2
2
|
import re
|
|
3
3
|
|
|
4
|
+
from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
|
|
5
|
+
|
|
4
6
|
|
|
5
7
|
def clean_organ_postcode(organ):
|
|
6
8
|
"""
|
|
@@ -120,6 +122,11 @@ def deal_rel_vol(vol_str: str):
|
|
|
120
122
|
"""
|
|
121
123
|
处理 期刊融合时的卷处理逻辑
|
|
122
124
|
"""
|
|
125
|
+
|
|
126
|
+
# 如果卷是全符号 清理掉
|
|
127
|
+
if is_all_symbols(vol_str):
|
|
128
|
+
vol_str = ""
|
|
129
|
+
|
|
123
130
|
if vol_str.replace(".", "").isdigit():
|
|
124
131
|
try:
|
|
125
132
|
float_num = float(vol_str)
|
|
@@ -156,20 +163,27 @@ def deal_num_strs(input_str):
|
|
|
156
163
|
return input_str
|
|
157
164
|
|
|
158
165
|
|
|
159
|
-
def deal_num(
|
|
166
|
+
def deal_num(num_str):
    """
    Normalise an issue ("期") string so that records can be grouped on it,
    in particular values written as a range with a dash.

    Written for the second split pass of the merge pipeline; usable elsewhere
    when the scenario fits.

    :param num_str: raw issue string
    :return: normalised issue string; range segments are joined with "_"
    """
    # An issue consisting only of symbols carries no information: drop it.
    if is_all_symbols(num_str):
        num_str = ""

    if num_str.lower().startswith("n "):
        num_str = num_str.lower().replace("n ", "").strip()

    # "-" and "." are both treated as range separators.
    num_str = num_str.replace("-", "_").replace(".", "_").upper()
    if num_str.find("_") > -1:
        # Normalise every segment. Unpacking into exactly two names raised
        # ValueError for inputs with more than one separator (e.g. "1-2-3");
        # two-segment inputs give the same result as before.
        num_str = "_".join(deal_num_strs(part) for part in num_str.split("_"))
    else:
        num_str = deal_num_strs(num_str)

    return num_str
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import pika
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# https://blog.csdn.net/songfreeman/article/details/50943603
|
|
5
|
+
class BasePika(object):
|
|
6
|
+
|
|
7
|
+
def __init__(self, username=None, password=None, mqhost=None, virtual_host=None):
|
|
8
|
+
self.username = username
|
|
9
|
+
self.password = password
|
|
10
|
+
self.conn = None
|
|
11
|
+
self.host = mqhost
|
|
12
|
+
self.virtual_host = virtual_host
|
|
13
|
+
self.auto_ack = True
|
|
14
|
+
|
|
15
|
+
def set_default(self):
|
|
16
|
+
self.host = "192.168.31.79"
|
|
17
|
+
self.virtual_host = "vhost_NetDataGather"
|
|
18
|
+
self.username = "vip"
|
|
19
|
+
self.password = "piv$*123"
|
|
20
|
+
|
|
21
|
+
def connect_str(self,amqp_str):
|
|
22
|
+
parameters = pika.URLParameters(amqp_str)
|
|
23
|
+
self.conn = pika.BlockingConnection(parameters)
|
|
24
|
+
|
|
25
|
+
def connect(self):
|
|
26
|
+
"""
|
|
27
|
+
设置用户名 密码 进行连接
|
|
28
|
+
:return:
|
|
29
|
+
"""
|
|
30
|
+
credentials = pika.PlainCredentials(self.username, self.password)
|
|
31
|
+
# parameters = pika.URLParameters('amqp://guest:guest@rabbit-server1:5672/%2F')
|
|
32
|
+
# 可以通过将 heartbeat 设为 0,关闭 rabbitmq 的心跳检测
|
|
33
|
+
parameters = pika.ConnectionParameters(host=self.host,
|
|
34
|
+
virtual_host=self.virtual_host,
|
|
35
|
+
credentials=credentials,
|
|
36
|
+
heartbeat=0)
|
|
37
|
+
self.conn = pika.BlockingConnection(parameters)
|
|
38
|
+
|
|
39
|
+
def close(self):
|
|
40
|
+
# 关闭消息队列
|
|
41
|
+
self.conn.close()
|
|
42
|
+
|
|
43
|
+
def create_channel(self):
|
|
44
|
+
self.channel = self.conn.channel()
|
|
45
|
+
|
|
46
|
+
def __del__(self):
|
|
47
|
+
self.channel.close()
|
|
48
|
+
self.conn.close()
|
|
49
|
+
|
|
50
|
+
def random_queue_declare(self):
|
|
51
|
+
"""
|
|
52
|
+
这样, result.method.queue 包含一个随机的队列名, 比如:看起来像 amq.gen-JzTY20BRgKO-HjmUJj0wLg.
|
|
53
|
+
其次:
|
|
54
|
+
一旦我们断开consumer连接,这个队列名将自动删除。这里有一个标识设置:
|
|
55
|
+
:return:
|
|
56
|
+
"""
|
|
57
|
+
return self.channel.queue_declare("", exclusive=True)
|
|
58
|
+
|
|
59
|
+
def queue_declare(self, queue="hello", durable=False):
|
|
60
|
+
"""
|
|
61
|
+
创建目的地队列hello 取消息时也可以调用
|
|
62
|
+
取消息和发送消息都调用 保证队列存在,也保证了不管服务端还是客户端先启动都有队列
|
|
63
|
+
durable True 为持久化
|
|
64
|
+
:return:
|
|
65
|
+
"""
|
|
66
|
+
return self.channel.queue_declare(queue=queue, durable=durable)
|
|
67
|
+
|
|
68
|
+
def get_queue_size(self, queue="hello"):
|
|
69
|
+
"""
|
|
70
|
+
获取某个队列的长度
|
|
71
|
+
:param queue:
|
|
72
|
+
:return:
|
|
73
|
+
"""
|
|
74
|
+
queue = self.queue_declare(queue=queue, durable=True)
|
|
75
|
+
return queue.method.message_count
|
|
76
|
+
|
|
77
|
+
def get_properties(self):
|
|
78
|
+
"""
|
|
79
|
+
与 queue_declare里的 durable = True 配合使用,
|
|
80
|
+
设置给 easy_send_msg的properties
|
|
81
|
+
:return:
|
|
82
|
+
"""
|
|
83
|
+
return pika.BasicProperties(
|
|
84
|
+
delivery_mode=2, # 设置消息为持久化的
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
def easy_send_msg(self, exchange="", routing_key="hello", body="hello world", properties=None):
|
|
88
|
+
"""
|
|
89
|
+
空字符串标识默认的或者匿名的exchange,如果存在routing_key, 消息路由到routing_key指定的队列中。
|
|
90
|
+
routing_key 标识发送到哪个队列,就是服务器上的队列名
|
|
91
|
+
body 发送的消息
|
|
92
|
+
|
|
93
|
+
basic_publish 如果 exchange 不是"" 但没有绑定队列 消息会消失
|
|
94
|
+
:return:
|
|
95
|
+
"""
|
|
96
|
+
self.channel.basic_publish(exchange=exchange,
|
|
97
|
+
routing_key=routing_key,
|
|
98
|
+
body=body,
|
|
99
|
+
properties=properties)
|
|
100
|
+
|
|
101
|
+
def basic_ack(self, ch, method):
|
|
102
|
+
"""
|
|
103
|
+
callback的消息确认
|
|
104
|
+
:param ch:
|
|
105
|
+
:param method:
|
|
106
|
+
:return:
|
|
107
|
+
"""
|
|
108
|
+
ch.basic_ack(delivery_tag=method.delivery_tag)
|
|
109
|
+
|
|
110
|
+
def callback(self, ch, method, properties, body):
|
|
111
|
+
"""
|
|
112
|
+
从队列接收消息要更复杂一些,它需要为队列订阅一个 callback 函数来进行接收。
|
|
113
|
+
当我们接收一个消息后,这个 callback 函数将会被 pika函数库自动调用,
|
|
114
|
+
在我们的这个实例里面这个函数将用来打印接收的消息内容到屏幕
|
|
115
|
+
:param method:
|
|
116
|
+
:param properties:
|
|
117
|
+
:param body:
|
|
118
|
+
:return:
|
|
119
|
+
"""
|
|
120
|
+
print(type(body))
|
|
121
|
+
print(" [x] Received %r" % body)
|
|
122
|
+
if self.auto_ack is False:
|
|
123
|
+
self.basic_ack(ch, method)
|
|
124
|
+
|
|
125
|
+
def set_get_msg_callback(self, routing_key="hello", callback=None, auto_ack=True):
|
|
126
|
+
"""
|
|
127
|
+
设置取消息的callback
|
|
128
|
+
no_ack 如果设置为True,将使用自动确认模式
|
|
129
|
+
no_ack 如果设置为False,在callback中确认
|
|
130
|
+
:return:
|
|
131
|
+
"""
|
|
132
|
+
self.auto_ack = auto_ack
|
|
133
|
+
if callback is None:
|
|
134
|
+
callback = self.callback
|
|
135
|
+
self.channel.basic_consume(routing_key,
|
|
136
|
+
callback,
|
|
137
|
+
auto_ack=auto_ack)
|
|
138
|
+
|
|
139
|
+
def start_get_msg(self):
    """
    Start fetching messages; messages are fetched continuously in a loop.

    :return: None
    """
    consume = self.channel.start_consuming
    consume()
|
|
145
|
+
|
|
146
|
+
def basic_qos(self, prefetch_count=1):
    """
    Set how many messages may be delivered ahead of time.

    Has no effect when ``auto_ack=True``. With ``prefetch_count == 1`` no new
    message is sent until the current one has been processed.

    :param prefetch_count: value forwarded to the channel's basic_qos
    :return: None
    """
    qos_args = {"prefetch_count": prefetch_count}
    self.channel.basic_qos(**qos_args)
|
|
153
|
+
|
|
154
|
+
def exchange_declare(self, exchangename="logs", type="fanout"):
    """
    Declare an exchange on the channel; it will subsequently be bound to queues.

    Exchange types are direct, topic, headers and fanout:

    * fanout - broadcasts every message received from the producer to all
      receivers it knows about (what a logger-style setup needs).
    * direct - a message goes to the queues whose binding key exactly matches
      the routing key.
    * topic - an upgraded form of direct; the routing_key cannot be arbitrary,
      it must be a dot-separated list of words, e.g. "stock.usd.nyse".
    * headers - matches on AMQP message headers instead of the routing key;
      otherwise identical to direct but much slower, so it is rarely used.

    :param exchangename: name of the exchange to declare
    :param type: exchange type (parameter name kept for backward compatibility)
    :return: whatever the channel's exchange_declare returns
    """
    declared = self.channel.exchange_declare(exchange=exchangename, exchange_type=type)
    return declared
|
|
170
|
+
|
|
171
|
+
def queue_bind(self, exchange="logs", queue="", routing_key=""):
    """
    Bind a queue to an exchange.

    For a temporary queue, obtain the queue name via
    ``self.random_queue_declare().method.queue``.

    :param exchange: exchange to bind to
    :param queue: queue name
    :param routing_key: binding key forwarded to the channel
    :return: None
    """
    binding = {
        "exchange": exchange,
        "queue": queue,
        "routing_key": routing_key,
    }
    self.channel.queue_bind(**binding)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import os
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Tuple

import aiomysql
from aiomysql import Pool, Connection, Cursor
|
|
6
|
+
|
|
7
|
+
# Keyword arguments passed to aiomysql.create_pool().
#
# NOTE(security): the host, user and password used to be fixed literals in this
# published module. The literals are kept as defaults so existing callers keep
# working, but every connection setting can now be overridden through an
# RE_COMMON_DB_* environment variable, so deployments need not rely on
# credentials shipped in source. The default password should be rotated.
DB_CONFIG = {
    'host': os.environ.get('RE_COMMON_DB_HOST', '192.168.98.55'),
    'port': int(os.environ.get('RE_COMMON_DB_PORT', 4000)),
    'user': os.environ.get('RE_COMMON_DB_USER', 'dataware_house_baseUser'),
    'password': os.environ.get('RE_COMMON_DB_PASSWORD', 'FF19AF831AEBD580B450B16BF9264200'),
    'db': os.environ.get('RE_COMMON_DB_NAME', 'dataware_house_base'),
    'charset': 'utf8mb4',
    'minsize': 16,  # minimum number of pooled connections
    'maxsize': 128,  # maximum number of pooled connections
    'autocommit': False,  # transactions are not committed automatically
    'pool_recycle': 3600,  # seconds before a connection is closed and recreated, to avoid stale connections
    'echo': False,  # whether to print SQL statements
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@asynccontextmanager
async def get_db_pool():
    """
    Async context manager for the database connection pool.

    Creates an aiomysql pool from ``DB_CONFIG``, yields it to the ``async with``
    body, and on exit (normal or via exception) closes the pool and waits for
    the close to complete.
    """
    db_pool: Pool = await aiomysql.create_pool(**DB_CONFIG)
    try:
        yield db_pool
    finally:
        # Always shut the pool down, even when the body raised.
        db_pool.close()
        await db_pool.wait_closed()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@asynccontextmanager
async def get_session(pool: Pool) -> AsyncGenerator[Tuple[Connection, Cursor], None]:
    """
    Obtain a database session from ``pool``.

    Acquires a connection from the pool, opens a cursor on it and yields the
    ``(connection, cursor)`` pair. Both context managers are exited when the
    ``async with`` body finishes.

    :param pool: pool to acquire the connection from
    """
    # A single `async with` with two items is equivalent to nesting them.
    async with pool.acquire() as connection, connection.cursor() as cur:
        yield connection, cur
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import traceback
|
|
3
|
+
|
|
4
|
+
from re_common.v2.baselibrary.utils.basepika import BasePika
|
|
5
|
+
from retry import retry
|
|
6
|
+
|
|
7
|
+
logging_logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UseMq(object):
    """
    Convenience wrapper around ``BasePika`` bound to one durable queue.

    Subclasses override ``callback2`` to process received messages; ``callback``
    takes care of acknowledging them afterwards.
    """

    def __init__(self, queue, qos=1):
        """
        Connect, open a channel, declare the durable queue and apply the QoS.

        :param queue: queue name used for both sending and receiving
        :param qos: prefetch count passed to ``BasePika.basic_qos``
        """
        self.queue = queue
        self.qos = qos
        self.basepika = BasePika()
        self.basepika.set_default()
        self.basepika.connect()
        self.basepika.create_channel()
        self.basepika.queue_declare(queue=queue, durable=True)
        self.basepika.basic_qos(qos)
        # Message properties reused for every message sent by this object.
        self.properties = self.basepika.get_properties()

    def re_conn(self):
        """
        Reconnect: open a new connection and channel, then re-declare the
        durable queue and re-apply the QoS setting.

        :return: None
        """
        self.basepika.connect()
        self.basepika.create_channel()
        self.basepika.queue_declare(queue=self.queue, durable=True)
        self.basepika.basic_qos(self.qos)

    @retry(delay=5, backoff=2, max_delay=60 * 3, logger=logging_logger)
    def get_mq(self):
        """
        Register ``self.callback`` with manual acknowledgement and start
        consuming from the queue.

        If the channel is closed it is reconnected first. When consuming fails,
        the traceback is printed and a reconnect is attempted, after which this
        method returns. Only an exception raised by that reconnect itself
        propagates to ``@retry``.

        :return: None
        """
        try:
            if self.basepika.channel.is_closed:
                logging_logger.info("重连中......")
                self.re_conn()
                logging_logger.info("重连完成......")
            self.basepika.set_get_msg_callback(routing_key=self.queue, callback=self.callback, auto_ack=False)
            self.basepika.start_get_msg()
        except Exception:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit stop the consumer instead of
            # being swallowed and answered with a reconnect.
            traceback.print_exc()
            logging_logger.info("重连中......")
            self.re_conn()

    def callback(self, ch, method, properties, body):
        """
        Consumer callback: delegate to ``callback2`` and then acknowledge the
        message when manual acknowledgement is in effect.

        The acknowledgement is skipped if ``callback2`` raises.
        """
        self.callback2(ch, method, properties, body)
        if self.basepika.auto_ack is False:
            self.basepika.basic_ack(ch, method)

    def callback2(self, ch, method, properties, body):
        """Hook for subclasses: process one received message. Does nothing by default."""
        pass

    @retry(delay=5, backoff=2, max_delay=60 * 3, logger=logging_logger)
    def send_mq(self, body, num=100):
        """
        Send ``body`` to the queue only if the queue currently holds fewer than
        ``num`` messages.

        :param body: message payload
        :param num: queue-size threshold
        :return: True if the message was published; False if the queue was
                 full or an error occurred (a reconnect is attempted then)
        """
        try:
            if self.basepika.get_queue_size(self.queue) < num:
                self.basepika.easy_send_msg(routing_key=self.queue,
                                            body=body,
                                            properties=self.properties)
                return True
            return False
        except Exception:
            # Narrowed from a bare `except:` so interpreter-exit signals
            # (KeyboardInterrupt / SystemExit) are not swallowed.
            traceback.print_exc()
            logging_logger.info("重连中......")
            self.re_conn()
            return False

    def get_server_mq_num(self, num=100):
        """
        Tell whether the queue currently holds fewer than ``num`` messages.

        :param num: queue-size threshold
        :return: True when the queue size is below ``num``, else False
        """
        return self.basepika.get_queue_size(self.queue) < num

    def easy_send_mq(self, body):
        """
        Send ``body`` to the queue unconditionally (no size check, no error
        handling).

        :param body: message payload
        :return: None
        """
        self.basepika.easy_send_msg(routing_key=self.queue,
                                    body=body,
                                    properties=self.properties)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
2
|
|
|
3
|
+
import unicodedata
|
|
4
|
+
|
|
3
5
|
|
|
4
6
|
def is_all_english_chars(s):
    """
    Return True when ``s`` is non-empty and consists solely of ASCII letters
    (A-Z, a-z); otherwise False.
    """
    # fullmatch is used instead of match(r'^...$'): `$` also matches just
    # before a trailing newline, so "abc\n" was wrongly accepted before.
    return bool(re.fullmatch(r'[A-Za-z]+', s))
|
|
@@ -29,7 +31,7 @@ def is_empty(value):
|
|
|
29
31
|
import pandas as pd
|
|
30
32
|
if pd.isna(value):
|
|
31
33
|
return True
|
|
32
|
-
except
|
|
34
|
+
except:
|
|
33
35
|
pass # 如果没有安装 pandas,跳过
|
|
34
36
|
|
|
35
37
|
# 如果是字符串,检查去除空白后是否为空
|
|
@@ -88,3 +90,13 @@ def is_single_cjk_char(char):
|
|
|
88
90
|
if start <= code_point <= end:
|
|
89
91
|
return True
|
|
90
92
|
return False
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def is_all_symbols(text):
    """
    Tell whether ``text`` consists entirely of symbols.

    A character counts as a symbol when its Unicode general category starts
    with ``P`` (punctuation) or ``S`` (symbol). An empty ``text`` yields False.
    """
    if not text:
        return False

    for ch in text:
        # The first letter of the category code is the major class.
        if unicodedata.category(ch)[0] not in ("P", "S"):
            return False
    return True
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
|
+
from urllib.parse import unquote
|
|
3
|
+
|
|
2
4
|
import regex
|
|
3
5
|
|
|
4
6
|
from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
|
|
@@ -116,6 +118,13 @@ class StringClear(object):
|
|
|
116
118
|
self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
|
|
117
119
|
return self
|
|
118
120
|
|
|
121
|
+
def url_to_str(self):
    """
    Decode URL (percent) encoding in the held string.

    Replaces ``self.obj_str`` with its unquoted form and returns ``self`` so
    calls can be chained.
    """
    decoded = unquote(self.obj_str)
    self.obj_str = decoded
    return self
|
|
127
|
+
|
|
119
128
|
def get_str(self):
    """Return the string currently held in ``self.obj_str``."""
    current = self.obj_str
    return current
|
|
121
130
|
|
|
@@ -163,26 +163,33 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
|
|
|
163
163
|
re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
|
|
164
164
|
re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
165
165
|
re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
166
|
+
re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
167
|
+
re_common/v2/baselibrary/decorators/utils.py,sha256=Q4D6KKCQxvNBXZkPQQn14keKKJpGtg8TUSakjJU40s0,2056
|
|
166
168
|
re_common/v2/baselibrary/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
167
169
|
re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
168
170
|
re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
|
|
169
171
|
re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5aEw-p2u1kYLpvr7RA,2796
|
|
170
172
|
re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
173
|
+
re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
|
|
171
174
|
re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
|
|
172
175
|
re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
|
|
176
|
+
re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=cChy6vhK8uSVIf3bRMGWpjociIbkiV-0j29WlZqQXHM,14207
|
|
173
177
|
re_common/v2/baselibrary/tools/list_tools.py,sha256=qYxdLccRbrULOBbaPdJ_MyFFmVJGVMdW5E36nJ3ejr8,249
|
|
174
|
-
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=
|
|
175
|
-
re_common/v2/baselibrary/tools/text_matcher.py,sha256=
|
|
178
|
+
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
|
|
179
|
+
re_common/v2/baselibrary/tools/text_matcher.py,sha256=cPMoFxaA0-ce3tLRxVSs8_3pTYS1oVIHDnNy_AlPU-4,10756
|
|
176
180
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
177
|
-
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=
|
|
181
|
+
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=qY6bWcucZIU7e4yiD5-x46iCdp4HFNg_32utsysCKkc,6322
|
|
178
182
|
re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
183
|
re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
|
|
180
184
|
re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
|
|
181
185
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
186
|
+
re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
|
|
187
|
+
re_common/v2/baselibrary/utils/db.py,sha256=6HfmQHAtDm-pFFoe-ouNQggkfGRdN8Do2pN4B0ev_WU,1204
|
|
182
188
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
189
|
+
re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
|
|
183
190
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
184
|
-
re_common/v2/baselibrary/utils/string_bool.py,sha256=
|
|
185
|
-
re_common/v2/baselibrary/utils/string_clear.py,sha256=
|
|
191
|
+
re_common/v2/baselibrary/utils/string_bool.py,sha256=0JxzftuL61UAF-2Vp9F1Og8kXp_y647KJC5jXus9QwM,3278
|
|
192
|
+
re_common/v2/baselibrary/utils/string_clear.py,sha256=1QAb_IC8FoVL5KzXhPicz4stsYD7LyASh5sXaXfs084,6445
|
|
186
193
|
re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
|
|
187
194
|
re_common/v2/baselibrary/utils/stringutils.py,sha256=WuxhXJVU6xuGfgHiSjxrn7Go1eobpa8DMR3Icoey4vo,6039
|
|
188
195
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -211,8 +218,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
211
218
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
212
219
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
213
220
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
214
|
-
re_common-10.0.
|
|
215
|
-
re_common-10.0.
|
|
216
|
-
re_common-10.0.
|
|
217
|
-
re_common-10.0.
|
|
218
|
-
re_common-10.0.
|
|
221
|
+
re_common-10.0.15.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
222
|
+
re_common-10.0.15.dist-info/METADATA,sha256=IhfGSUxRXpHVDZv-ZwqvSxD6yiI_WbeDEHVDpn-RvyU,582
|
|
223
|
+
re_common-10.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
224
|
+
re_common-10.0.15.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
225
|
+
re_common-10.0.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|