re-common 10.0.13__py3-none-any.whl → 10.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/decorators/__init__.py +0 -0
- re_common/v2/baselibrary/decorators/utils.py +59 -0
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +76 -0
- re_common/v2/baselibrary/tools/search_hash_tools.py +4 -3
- re_common/v2/baselibrary/tools/text_matcher.py +131 -28
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +21 -7
- re_common/v2/baselibrary/utils/basepika.py +180 -0
- re_common/v2/baselibrary/utils/mq.py +83 -0
- re_common/v2/baselibrary/utils/string_bool.py +12 -0
- re_common/v2/baselibrary/utils/string_clear.py +9 -0
- {re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/METADATA +1 -1
- {re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/RECORD +15 -10
- {re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/LICENSE +0 -0
- {re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/WHEEL +0 -0
- {re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/top_level.txt +0 -0

re_common/v2/baselibrary/decorators/__init__.py
File without changes

re_common/v2/baselibrary/decorators/utils.py
@@ -0,0 +1,59 @@
+import warnings
+import functools
+
+# Global set used to record functions or classes that have already warned
+_warned_once = set()
+
+
+def deprecated(message=None):
+    """
+    Decorator: marks a function or class as deprecated and emits the warning only once per process.
+
+    Args:
+        message (str): custom warning message, defaults to None.
+    """
+
+    def decorator(obj):
+        # If the object is a function
+        if isinstance(obj, type(lambda: None)):
+            @functools.wraps(obj)
+            def wrapper(*args, **kwargs):
+                obj_id = id(obj)  # use the object's memory address as the unique identifier
+                if obj_id not in _warned_once:
+                    default_msg = f"Function {obj.__name__} is deprecated."
+                    warn_msg = f"{default_msg} {message}" if message else default_msg
+                    warnings.warn(
+                        warn_msg,
+                        category=DeprecationWarning,
+                        stacklevel=2
+                    )
+                    _warned_once.add(obj_id)  # record that the warning has been issued
+                return obj(*args, **kwargs)
+
+            return wrapper
+
+        # If the object is a class
+        elif isinstance(obj, type):
+            orig_init = obj.__init__
+
+            @functools.wraps(orig_init)
+            def new_init(self, *args, **kwargs):
+                obj_id = id(obj)
+                if obj_id not in _warned_once:
+                    default_msg = f"Class {obj.__name__} is deprecated."
+                    warn_msg = f"{default_msg} {message}" if message else default_msg
+                    warnings.warn(
+                        warn_msg,
+                        category=DeprecationWarning,
+                        stacklevel=2
+                    )
+                    _warned_once.add(obj_id)  # record that the warning has been issued
+                orig_init(self, *args, **kwargs)
+
+            obj.__init__ = new_init
+            return obj
+
+        else:
+            raise TypeError("This decorator can only be applied to functions and classes")
+
+    return decorator
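
In practice the decorator is applied like this; a minimal sketch (old_parse and its replacement hint are made-up names, and the warning filter is enabled explicitly because DeprecationWarning is hidden by default):

    import warnings

    from re_common.v2.baselibrary.decorators.utils import deprecated

    warnings.simplefilter("always", DeprecationWarning)  # make DeprecationWarning visible


    @deprecated("Use new_parse() instead.")  # new_parse is a hypothetical replacement
    def old_parse(text):
        return text.strip()


    old_parse(" a ")  # first call emits one DeprecationWarning
    old_parse(" b ")  # later calls stay silent because of _warned_once
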

re_common/v2/baselibrary/tools/ac_ahocorasick.py
@@ -0,0 +1,76 @@
+import pickle
+
+import ahocorasick
+
+
+class ACTool(object):
+
+    def __init__(self):
+        self.automaton = ahocorasick.Automaton()
+
+    def add_word(self, key, value, overwrite=True) -> bool:
+        """
+        Add an entry to the AC automaton; by default a duplicate key is simply overwritten
+        :param key: the keyword to add
+        :param value: the associated value
+        :param overwrite: whether to overwrite an existing key, defaults to True
+        :return: whether the entry was added or overwritten
+        """
+        if key in self.automaton:  # check whether the key already exists
+            if overwrite:  # overwriting is allowed
+                self.automaton.add_word(key, value)
+                return True
+            else:  # overwriting is not allowed, skip
+                return False
+        else:  # the key does not exist, add it directly
+            self.automaton.add_word(key, value)
+            return True
+
+    def is_exists_key(self, key) -> bool:
+        # whether the key exists
+        if self.automaton.exists(key):
+            return True
+        else:
+            return False
+
+    def make_automaton(self):
+        """
+        Must be called after all words have been added
+        """
+        self.automaton.make_automaton()
+
+    def iter(self, key):
+        """
+        The result is an iterable that can be turned with list() into [(end_index, value)]
+        tool.add_word("he", "word1")
+        tool.add_word("hello", "word2")
+
+        # search for matches inside a string
+        input_string = "hello world"
+        matches = list(tool.automaton.iter(input_string))
+        print(matches)  # [(1, 'word1'), (4, 'word2')]
+
+        (1, 'word1'):
+            end_index = 1: the matched keyword "he" ends at index 1 of input_string = "hello world" (the position of the last character 'e' of "he").
+            indices of "hello world": h(0) e(1) l(2) l(3) o(4) (5) w(6) o(7) r(8) l(9) d(10).
+            value = 'word1': the value associated with the matched keyword "he".
+        (4, 'word2'):
+            end_index = 4: the matched keyword "hello" ends at index 4 of input_string = "hello world" (the position of the last character 'o' of "hello").
+            value = 'word2': the value associated with the matched keyword "hello".
+
+        Note: only the value is returned, never the key; if the key is needed, pack it into the value
+        """
+
+        result_iter = self.automaton.iter(key)  # ahocorasick.AutomatonSearchIter
+        return result_iter
+    def save(self, local_temp_path):
+        """
+        Save the built AC automaton to a local file
+        """
+        self.automaton.save(local_temp_path, pickle.dumps)
+
+    def load(self, local_temp_path):
+        """
+        Load a previously built AC automaton
+        """
+        self.automaton = ahocorasick.load(local_temp_path, pickle.loads)
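
A short usage sketch of ACTool, assuming the pyahocorasick package is installed; the keys, values and the save path are illustrative:

    from re_common.v2.baselibrary.tools.ac_ahocorasick import ACTool

    tool = ACTool()
    # store the key together with the value, since iter() only yields the stored value
    tool.add_word("he", ("he", "word1"))
    tool.add_word("hello", ("hello", "word2"))
    tool.make_automaton()  # required before iter()

    for end_index, (key, value) in tool.iter("hello world"):
        print(end_index, key, value)  # 1 he word1, then 4 hello word2

    tool.save("/tmp/ac.bin")  # illustrative path
    tool.load("/tmp/ac.bin")  # reload the persisted automaton later
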

re_common/v2/baselibrary/tools/search_hash_tools.py
@@ -3,9 +3,10 @@ from typing import List
 import jieba
 from datasketch import MinHash, minhash
 
+from re_common.v2.baselibrary.decorators.utils import deprecated
 from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
 
-
+@deprecated("Please use the methods in TextMatcherV2 instead.")
 def tokenize(text: str, stopwords=None) -> List[str]:
     """
     Tokenize and remove stopwords
@@ -32,7 +33,7 @@ def tokenize(text: str, stopwords=None) -> List[str]:
     words = [w for w in words if w not in stopwords and w.strip()]
     return words
 
-
+@deprecated("Please use the methods in TextMatcherV2 instead.")
 def create_minhash(words: List[str], num_perm=128) -> MinHash:
     """
     Create a MinHash from the token list
@@ -42,7 +43,7 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
         minhash.update(word.encode("utf-8"))
     return minhash
 
-
+@deprecated("Please use the methods in TextMatcherV2 instead.")
 def get_str_minhash(title):
     from re_common.v2.baselibrary.utils.string_clear import rel_clear
     rel_title = rel_clear(title)
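
The practical effect of these decorators, as a small sketch (the input string is arbitrary): the legacy helpers keep working, but the first call now emits a one-time DeprecationWarning pointing at TextMatcherV2:

    import warnings

    from re_common.v2.baselibrary.tools.search_hash_tools import get_str_minhash

    warnings.simplefilter("always", DeprecationWarning)

    # Still returns a MinHash, but the first call also emits:
    #   DeprecationWarning: Function get_str_minhash is deprecated.
    #   Please use the methods in TextMatcherV2 instead.
    minhash = get_str_minhash("hello world")
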

re_common/v2/baselibrary/tools/text_matcher.py
@@ -1,10 +1,16 @@
+import pickle
+
 import jieba
 import re
-from typing import List, Dict, Tuple, Set, Optional, Union
+from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
 from datasketch import MinHash, MinHashLSH
 
+from re_common.v2.baselibrary.decorators.utils import deprecated
+from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
+
 
-class TextMatcher:
+@deprecated("Please use TextMatcherV2 instead.")
+class TextMatcher(object):
     def __init__(
         self,
         threshold: float = 0.5,
@@ -188,36 +194,133 @@ class TextMatcher:
         self.doc_counter = 0
 
 
-
-
-
-
-
-
+# A protocol describing the required "duck-like" behaviour
+class TokenizeDuckLike(Protocol):
+    def get_words(self, text) -> List:
+        pass
+
+
+class JiebaTokenize(object):
+
+    def __init__(self, stopwords=None):
+        self.stopwords = stopwords
+
+    def get_words(self, text) -> List:
+
+        if self.stopwords is None:
+            self.stopwords = []
+        words = jieba.lcut(text)
+
+        # Count the single-character tokens, to guard against jieba breaking words it cannot segment into isolated characters
+
+        # A named function is used instead of inlining the condition in the comprehension: in some Spark jobs the if condition of a comprehension does not short-circuit and the whole expression is treated as a single operator
+        def is_singel_en(i):
+            if len(i) == 1 and not is_single_cjk_char(i):
+                return True
+            return False
+
+        one_char_size = len([i for i in words if is_singel_en(i)])
+        all_size = len(words)
+        # If the proportion of single characters is too high, fall back to whitespace tokenization
+        if all_size != 0 and one_char_size / all_size > 0.6:
+            words = [i for i in text.split() if i.strip()]
+
+        # Filter out stopwords and empty strings
+        words = [w for w in words if w not in self.stopwords and w.strip()]
+        return words
+
+
+class TextMatcherV2:
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        num_perm: int = 128,
+        tdk: TokenizeDuckLike = None
+    ):
+        """
+        Initialize the text matcher
+
+        Args:
+            threshold: LSH similarity threshold
+            num_perm: number of MinHash permutations
+            tdk: tokenizer implementing get_words(text) -> List
+                 (any TokenizeDuckLike-compatible object)
+        """
+        self.threshold = threshold
+        self.num_perm = num_perm
+        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+        self.tdk = tdk
+
+    def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
+        if isinstance(minhash, str):
+            minhash = self.str_to_minihash(minhash, tdk)
 
-
-doc_id = matcher.add_document(
-    "北京是中国的首都"
-)
+        self.lsh.insert(doc_id, minhash)
 
-
-
-
+    def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
+        def _add_document(minhash_or_str, tdk):
+            if isinstance(minhash_or_str, str):
+                minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
+            self.add_document(docid, minhash_or_str, tdk)
+
+        if isinstance(betch_data, list):
+            # each element must be an unpackable two-item tuple or list
+            for docid, minhash_or_str in betch_data:
+                _add_document(minhash_or_str, tdk)
+        elif isinstance(betch_data, dict):
+            for docid, minhash_or_str in betch_data.items():
+                _add_document(minhash_or_str, tdk)
+        else:
+            raise Exception("Wrong data type")
+
+    def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
+        # use LSH to find the candidate set
+        if isinstance(query_minhash, str):
+            query_minhash = self.str_to_minihash(query_minhash, tdk)
+        similar_docs = self.lsh.query(query_minhash)
+        return similar_docs
 
-
-
-
+    def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
+        """
+        Create a MinHash from the token list
+        """
+        if num_perm is None:
+            num_perm = self.num_perm
+        minhash = MinHash(num_perm=num_perm)
+        for word in words:
+            minhash.update(word.encode("utf-8"))
+        return minhash
 
-
-
-
+    def create_words(self, text: str, tdk: TokenizeDuckLike = None):
+        if tdk is None:
+            tdk = self.tdk
+        worlds = tdk.get_words(text)
+        return worlds
+
+    def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
+        if tdk is None:
+            tdk = self.tdk
+        words = self.create_words(text, tdk)
+        minhash = self.create_minhash(words, self.num_perm)
+        return minhash
 
-
-
-
+    def minhash_dumps(self, minhash) -> bytes:
+        """
+        Serialize a MinHash
+        """
+        serialized_minhash = pickle.dumps(minhash)
+        return serialized_minhash
 
-
-
+    def minhash_loads(self, serialized_minhash) -> MinHash:
+        """
+        Deserialize a MinHash
+        """
+        minhash = pickle.loads(serialized_minhash)
+        return minhash
 
-
-
+    def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
+        """
+        Merge in an LSH index that was built elsewhere
+        """
+        self.lsh.merge(other_minhashlsh)
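
A minimal sketch of the new TextMatcherV2 together with JiebaTokenize; the document ids and texts are made up, and which candidates come back depends on the LSH threshold:

    from re_common.v2.baselibrary.tools.text_matcher import JiebaTokenize, TextMatcherV2

    matcher = TextMatcherV2(threshold=0.5, num_perm=128, tdk=JiebaTokenize())

    # plain strings are tokenized and hashed internally via str_to_minihash
    matcher.batch_add_documents({
        "doc1": "北京是中国的首都",
        "doc2": "上海是中国的经济中心",
    })

    # query with a raw string; returns the ids of the LSH candidate documents
    print(matcher.find_similar("北京是中国首都"))

    # MinHash objects can also be pickled for storage and re-used later
    mh = matcher.str_to_minihash("北京是中国的首都")
    blob = matcher.minhash_dumps(mh)
    matcher.add_document("doc3", matcher.minhash_loads(blob))
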

re_common/v2/baselibrary/utils/BusinessStringUtil.py
@@ -1,6 +1,8 @@
 # String handling for certain business flows; scenario-specific tools rather than general-purpose utilities
 import re
 
+from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
+
 
 def clean_organ_postcode(organ):
     """
@@ -120,6 +122,11 @@ def deal_rel_vol(vol_str: str):
     """
     Volume handling logic used when merging journal records
     """
+
+    # If the volume string is all symbols, clear it
+    if is_all_symbols(vol_str):
+        vol_str = ""
+
     if vol_str.replace(".", "").isdigit():
         try:
             float_num = float(vol_str)
@@ -156,20 +163,27 @@ def deal_num_strs(input_str):
     return input_str
 
 
-def deal_num(
+def deal_num(num_str):
     """
     Normalize the issue number to make grouping easier, especially for values containing a dash
     Used for the second split during merging; it can also be used elsewhere when the scenario fits
     :param strs:
     :return:
     """
-
-    if
-
+    # If the issue string is all symbols, clear it
+    if is_all_symbols(num_str):
+        num_str = ""
+
+    if num_str.lower().startswith("n "):
+        num_str = num_str.lower().replace("n ", "").strip()
+
+    num_str = num_str.replace("-", "_").replace(".", "_").upper()
+    if num_str.find("_") > -1:
+        start, end = num_str.split("_")
         start = deal_num_strs(start)
         end = deal_num_strs(end)
-
+        num_str = start + "_" + end
     else:
-
+        num_str = deal_num_strs(num_str)
 
-    return
+    return num_str
|
|
|
1
|
+
import pika
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# https://blog.csdn.net/songfreeman/article/details/50943603
|
|
5
|
+
class BasePika(object):
|
|
6
|
+
|
|
7
|
+
def __init__(self, username=None, password=None, mqhost=None, virtual_host=None):
|
|
8
|
+
self.username = username
|
|
9
|
+
self.password = password
|
|
10
|
+
self.conn = None
|
|
11
|
+
self.host = mqhost
|
|
12
|
+
self.virtual_host = virtual_host
|
|
13
|
+
self.auto_ack = True
|
|
14
|
+
|
|
15
|
+
def set_default(self):
|
|
16
|
+
self.host = "192.168.31.79"
|
|
17
|
+
self.virtual_host = "vhost_NetDataGather"
|
|
18
|
+
self.username = "vip"
|
|
19
|
+
self.password = "piv$*123"
|
|
20
|
+
|
|
21
|
+
def connect_str(self,amqp_str):
|
|
22
|
+
parameters = pika.URLParameters(amqp_str)
|
|
23
|
+
self.conn = pika.BlockingConnection(parameters)
|
|
24
|
+
|
|
25
|
+
def connect(self):
|
|
26
|
+
"""
|
|
27
|
+
设置用户名 密码 进行连接
|
|
28
|
+
:return:
|
|
29
|
+
"""
|
|
30
|
+
credentials = pika.PlainCredentials(self.username, self.password)
|
|
31
|
+
# parameters = pika.URLParameters('amqp://guest:guest@rabbit-server1:5672/%2F')
|
|
32
|
+
# 可以通过将 heartbeat 设为 0,关闭 rabbitmq 的心跳检测
|
|
33
|
+
parameters = pika.ConnectionParameters(host=self.host,
|
|
34
|
+
virtual_host=self.virtual_host,
|
|
35
|
+
credentials=credentials,
|
|
36
|
+
heartbeat=0)
|
|
37
|
+
self.conn = pika.BlockingConnection(parameters)
|
|
38
|
+
|
|
39
|
+
def close(self):
|
|
40
|
+
# 关闭消息队列
|
|
41
|
+
self.conn.close()
|
|
42
|
+
|
|
43
|
+
def create_channel(self):
|
|
44
|
+
self.channel = self.conn.channel()
|
|
45
|
+
|
|
46
|
+
def __del__(self):
|
|
47
|
+
self.channel.close()
|
|
48
|
+
self.conn.close()
|
|
49
|
+
|
|
50
|
+
def random_queue_declare(self):
|
|
51
|
+
"""
|
|
52
|
+
这样, result.method.queue 包含一个随机的队列名, 比如:看起来像 amq.gen-JzTY20BRgKO-HjmUJj0wLg.
|
|
53
|
+
其次:
|
|
54
|
+
一旦我们断开consumer连接,这个队列名将自动删除。这里有一个标识设置:
|
|
55
|
+
:return:
|
|
56
|
+
"""
|
|
57
|
+
return self.channel.queue_declare("", exclusive=True)
|
|
58
|
+
|
|
59
|
+
def queue_declare(self, queue="hello", durable=False):
|
|
60
|
+
"""
|
|
61
|
+
创建目的地队列hello 取消息时也可以调用
|
|
62
|
+
取消息和发送消息都调用 保证队列存在,也保证了不管服务端还是客户端先启动都有队列
|
|
63
|
+
durable True 为持久化
|
|
64
|
+
:return:
|
|
65
|
+
"""
|
|
66
|
+
return self.channel.queue_declare(queue=queue, durable=durable)
|
|
67
|
+
|
|
68
|
+
def get_queue_size(self, queue="hello"):
|
|
69
|
+
"""
|
|
70
|
+
获取某个队列的长度
|
|
71
|
+
:param queue:
|
|
72
|
+
:return:
|
|
73
|
+
"""
|
|
74
|
+
queue = self.queue_declare(queue=queue, durable=True)
|
|
75
|
+
return queue.method.message_count
|
|
76
|
+
|
|
77
|
+
def get_properties(self):
|
|
78
|
+
"""
|
|
79
|
+
与 queue_declare里的 durable = True 配合使用,
|
|
80
|
+
设置给 easy_send_msg的properties
|
|
81
|
+
:return:
|
|
82
|
+
"""
|
|
83
|
+
return pika.BasicProperties(
|
|
84
|
+
delivery_mode=2, # 设置消息为持久化的
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
def easy_send_msg(self, exchange="", routing_key="hello", body="hello world", properties=None):
|
|
88
|
+
"""
|
|
89
|
+
空字符串标识默认的或者匿名的exchange,如果存在routing_key, 消息路由到routing_key指定的队列中。
|
|
90
|
+
routing_key 标识发送到哪个队列,就是服务器上的队列名
|
|
91
|
+
body 发送的消息
|
|
92
|
+
|
|
93
|
+
basic_publish 如果 exchange 不是"" 但没有绑定队列 消息会消失
|
|
94
|
+
:return:
|
|
95
|
+
"""
|
|
96
|
+
self.channel.basic_publish(exchange=exchange,
|
|
97
|
+
routing_key=routing_key,
|
|
98
|
+
body=body,
|
|
99
|
+
properties=properties)
|
|
100
|
+
|
|
101
|
+
def basic_ack(self, ch, method):
|
|
102
|
+
"""
|
|
103
|
+
callback的消息确认
|
|
104
|
+
:param ch:
|
|
105
|
+
:param method:
|
|
106
|
+
:return:
|
|
107
|
+
"""
|
|
108
|
+
ch.basic_ack(delivery_tag=method.delivery_tag)
|
|
109
|
+
|
|
110
|
+
def callback(self, ch, method, properties, body):
|
|
111
|
+
"""
|
|
112
|
+
从队列接收消息要更复杂一些,它需要为队列订阅一个 callback 函数来进行接收。
|
|
113
|
+
当我们接收一个消息后,这个 callback 函数将会被 pika函数库自动调用,
|
|
114
|
+
在我们的这个实例里面这个函数将用来打印接收的消息内容到屏幕
|
|
115
|
+
:param method:
|
|
116
|
+
:param properties:
|
|
117
|
+
:param body:
|
|
118
|
+
:return:
|
|
119
|
+
"""
|
|
120
|
+
print(type(body))
|
|
121
|
+
print(" [x] Received %r" % body)
|
|
122
|
+
if self.auto_ack is False:
|
|
123
|
+
self.basic_ack(ch, method)
|
|
124
|
+
|
|
125
|
+
def set_get_msg_callback(self, routing_key="hello", callback=None, auto_ack=True):
|
|
126
|
+
"""
|
|
127
|
+
设置取消息的callback
|
|
128
|
+
no_ack 如果设置为True,将使用自动确认模式
|
|
129
|
+
no_ack 如果设置为False,在callback中确认
|
|
130
|
+
:return:
|
|
131
|
+
"""
|
|
132
|
+
self.auto_ack = auto_ack
|
|
133
|
+
if callback is None:
|
|
134
|
+
callback = self.callback
|
|
135
|
+
self.channel.basic_consume(routing_key,
|
|
136
|
+
callback,
|
|
137
|
+
auto_ack=auto_ack)
|
|
138
|
+
|
|
139
|
+
def start_get_msg(self):
|
|
140
|
+
"""
|
|
141
|
+
开始取消息,会循环不停的取消息
|
|
142
|
+
:return:
|
|
143
|
+
"""
|
|
144
|
+
self.channel.start_consuming()
|
|
145
|
+
|
|
146
|
+
def basic_qos(self, prefetch_count=1):
|
|
147
|
+
"""
|
|
148
|
+
可以提前发送几个消息来,当auto_ack=True时无效
|
|
149
|
+
prefetch_count==1 消息未处理完前不要发送信息的消息
|
|
150
|
+
:return:
|
|
151
|
+
"""
|
|
152
|
+
self.channel.basic_qos(prefetch_count=prefetch_count)
|
|
153
|
+
|
|
154
|
+
def exchange_declare(self, exchangename="logs", type="fanout"):
|
|
155
|
+
"""
|
|
156
|
+
fanout exchange非常简单,你从这个名字中就能猜出来,它将从Producer方收到的消息广播给所有他知道的receiver方。而这正是我们的logger记录所需要的消息。
|
|
157
|
+
交换的类型
|
|
158
|
+
直接交换(direct exchange)的路由算法很简单 -- 消息发送到绑定键值(binding key) 刚好完全符合路由键值( routing key) 的消息队列中。
|
|
159
|
+
|
|
160
|
+
消息发送到一个 topic交换不能是一个任意的 routing_key -- 它必须是一个用小数点 分割的单词列表。 这个字符可以是任何单词,但是通常是指定一些连接特定消息的功能。一些有效的路由键(routing key)比如:“stock.usd.nyse",
|
|
161
|
+
topic 是 直接交换的升级版
|
|
162
|
+
|
|
163
|
+
headers Exchange :headers交换器允许你匹配AMQP消息的header而非路由键。除此之外,headers交换器和direct交换器完全一致,但性能会差很多。因此它并不太实用,而且几乎再也用不到了。
|
|
164
|
+
exchangename接下来会与队列绑定
|
|
165
|
+
direct , topic , headers 和 fanout
|
|
166
|
+
:return:
|
|
167
|
+
"""
|
|
168
|
+
return self.channel.exchange_declare(exchange=exchangename,
|
|
169
|
+
exchange_type=type)
|
|
170
|
+
|
|
171
|
+
def queue_bind(self, exchange="logs", queue="", routing_key=""):
|
|
172
|
+
"""
|
|
173
|
+
queue 临时队列获取 self.random_queue_declare().method.queue
|
|
174
|
+
:param exchange:
|
|
175
|
+
:param queue:
|
|
176
|
+
:return:
|
|
177
|
+
"""
|
|
178
|
+
self.channel.queue_bind(exchange=exchange,
|
|
179
|
+
queue=queue,
|
|
180
|
+
routing_key=routing_key)
|
|
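
A rough producer/consumer sketch built on BasePika; the broker settings come from set_default() and the queue name is illustrative:

    from re_common.v2.baselibrary.utils.basepika import BasePika

    mq = BasePika()
    mq.set_default()  # or pass username/password/mqhost/virtual_host to __init__
    mq.connect()
    mq.create_channel()
    mq.queue_declare(queue="demo_queue", durable=True)

    # publish a persistent message to the default exchange
    mq.easy_send_msg(routing_key="demo_queue",
                     body="hello world",
                     properties=mq.get_properties())

    # consume with manual acknowledgement (handled by the default callback)
    mq.basic_qos(prefetch_count=1)
    mq.set_get_msg_callback(routing_key="demo_queue", auto_ack=False)
    mq.start_get_msg()  # blocks and keeps consuming
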

re_common/v2/baselibrary/utils/mq.py
@@ -0,0 +1,83 @@
+import logging
+import traceback
+
+from re_common.v2.baselibrary.utils.basepika import BasePika
+from retry import retry
+
+logging_logger = logging.getLogger(__name__)
+
+
+class UseMq(object):
+
+    def __init__(self, queue, qos=1):
+        self.queue = queue
+        self.qos = qos
+        self.basepika = BasePika()
+        self.basepika.set_default()
+        self.basepika.connect()
+        self.basepika.create_channel()
+        self.basepika.queue_declare(queue=queue, durable=True)
+        self.basepika.basic_qos(qos)
+        self.properties = self.basepika.get_properties()
+
+    def re_conn(self):
+        """
+        Reconnect
+        :return:
+        """
+        self.basepika.connect()
+        self.basepika.create_channel()
+        self.basepika.queue_declare(queue=self.queue, durable=True)
+        self.basepika.basic_qos(self.qos)
+
+    @retry(delay=5, backoff=2, max_delay=60 * 3, logger=logging_logger)
+    def get_mq(self):
+        try:
+            if self.basepika.channel.is_closed:
+                logging_logger.info("reconnecting......")
+                self.re_conn()
+                logging_logger.info("reconnect finished......")
+            self.basepika.set_get_msg_callback(routing_key=self.queue, callback=self.callback, auto_ack=False)
+            self.basepika.start_get_msg()
+        except:
+            traceback.print_exc()
+            logging_logger.info("reconnecting......")
+            self.re_conn()
+
+    def callback(self, ch, method, properties, body):
+        # print(type(body))
+        # print(" [x] Received %r" % body)
+        # body = body.decode()
+        self.callback2(ch, method, properties, body)
+        if self.basepika.auto_ack is False:
+            self.basepika.basic_ack(ch, method)
+
+    def callback2(self, ch, method, properties, body):
+        pass
+
+    @retry(delay=5, backoff=2, max_delay=60 * 3, logger=logging_logger)
+    def send_mq(self, body, num=100):
+        try:
+            if self.basepika.get_queue_size(self.queue) < num:
+                self.basepika.easy_send_msg(routing_key=self.queue,
+                                            body=body,
+                                            properties=self.properties)
+                return True
+            else:
+                return False
+        except:
+            traceback.print_exc()
+            logging_logger.info("reconnecting......")
+            self.re_conn()
+            return False
+
+    def get_server_mq_num(self, num=100):
+        if self.basepika.get_queue_size(self.queue) < num:
+            return True
+        else:
+            return False
+
+    def easy_send_mq(self, body):
+        self.basepika.easy_send_msg(routing_key=self.queue,
+                                    body=body,
+                                    properties=self.properties)
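
A sketch of how UseMq appears intended to be specialised: subclass it and override callback2; the worker class, queue name and handler body are hypothetical:

    from re_common.v2.baselibrary.utils.mq import UseMq


    class DemoWorker(UseMq):

        def callback2(self, ch, method, properties, body):
            # business logic goes here; the base callback() acks the message afterwards
            print("got:", body.decode())


    worker = DemoWorker(queue="demo_queue", qos=1)
    worker.send_mq("hello", num=100)  # only publishes while the backlog is below num
    worker.get_mq()                   # blocks and consumes with manual acks
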

re_common/v2/baselibrary/utils/string_bool.py
@@ -1,5 +1,7 @@
 import re
 
+import unicodedata
+
 
 def is_all_english_chars(s):
     return bool(re.match(r'^[A-Za-z]+$', s))
@@ -88,3 +90,13 @@ def is_single_cjk_char(char):
         if start <= code_point <= end:
             return True
     return False
+
+
+def is_all_symbols(text):
+    # Is the string made up entirely of symbols?
+    # Return False for an empty string
+    if not text:
+        return False
+
+    # Check whether every character belongs to a punctuation or symbol category
+    return all(unicodedata.category(char).startswith(('P', 'S')) for char in text)
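
A quick illustration of the new helper (the sample strings are made up):

    from re_common.v2.baselibrary.utils.string_bool import is_all_symbols

    print(is_all_symbols("()-/#"))   # True: every character is punctuation or a symbol
    print(is_all_symbols("Vol.12"))  # False: contains letters and digits
    print(is_all_symbols(""))        # False: empty strings are rejected explicitly
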

re_common/v2/baselibrary/utils/string_clear.py
@@ -1,4 +1,6 @@
 import re
+from urllib.parse import unquote
+
 import regex
 
 from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
@@ -116,6 +118,13 @@ class StringClear(object):
         self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
         return self
 
+    def url_to_str(self):
+        """
+        Decode URL-encoded text back to plain characters
+        """
+        self.obj_str = unquote(self.obj_str)
+        return self
+
     def get_str(self):
         return self.obj_str
 
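
url_to_str simply runs urllib's unquote over the wrapped string and returns self so it can be chained; a small sketch of the decoding it performs (the example string is made up, and the chained call assumes StringClear is constructed with the raw string, which this hunk does not show):

    from urllib.parse import unquote

    raw = "Deep%20Learning%20%26%20NLP"
    print(unquote(raw))  # Deep Learning & NLP -- the transformation url_to_str() applies to obj_str

    # chained form (assumes StringClear wraps the raw string at construction time):
    # from re_common.v2.baselibrary.utils.string_clear import StringClear
    # cleaned = StringClear(raw).url_to_str().get_str()
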

{re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/RECORD
@@ -163,26 +163,31 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
 re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
 re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+re_common/v2/baselibrary/decorators/utils.py,sha256=Q4D6KKCQxvNBXZkPQQn14keKKJpGtg8TUSakjJU40s0,2056
 re_common/v2/baselibrary/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
 re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5aEw-p2u1kYLpvr7RA,2796
 re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
 re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
 re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
 re_common/v2/baselibrary/tools/list_tools.py,sha256=qYxdLccRbrULOBbaPdJ_MyFFmVJGVMdW5E36nJ3ejr8,249
-re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=
-re_common/v2/baselibrary/tools/text_matcher.py,sha256=
+re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
+re_common/v2/baselibrary/tools/text_matcher.py,sha256=C_3RDJo0ev6xTgj9EsJLmDAe01k443I6aCeXyMCpcBY,10748
 re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
-re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=
+re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=qY6bWcucZIU7e4yiD5-x46iCdp4HFNg_32utsysCKkc,6322
 re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
 re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
 re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
+re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
 re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
+re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
 re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
-re_common/v2/baselibrary/utils/string_bool.py,sha256=
-re_common/v2/baselibrary/utils/string_clear.py,sha256=
+re_common/v2/baselibrary/utils/string_bool.py,sha256=YzefIhHqAjUL9ov8YyAoTLivMJyi10rkBFiLjwKju20,3290
+re_common/v2/baselibrary/utils/string_clear.py,sha256=1QAb_IC8FoVL5KzXhPicz4stsYD7LyASh5sXaXfs084,6445
 re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
 re_common/v2/baselibrary/utils/stringutils.py,sha256=WuxhXJVU6xuGfgHiSjxrn7Go1eobpa8DMR3Icoey4vo,6039
 re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -211,8 +216,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
 re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
 re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
 re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
+re_common-10.0.14.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+re_common-10.0.14.dist-info/METADATA,sha256=41i895Vwb1TJpI_7Kf_fWDmadz6YJnRHWnOj-iz7Tm0,582
+re_common-10.0.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+re_common-10.0.14.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
+re_common-10.0.14.dist-info/RECORD,,

{re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/LICENSE
File without changes

{re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/WHEEL
File without changes

{re_common-10.0.13.dist-info → re_common-10.0.14.dist-info}/top_level.txt
File without changes