re-common 10.0.13__py3-none-any.whl → 10.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
+ import warnings
+ import functools
+
+ # Global set recording which functions or classes have already warned
+ _warned_once = set()
+
+
+ def deprecated(message=None):
+     """
+     Decorator: marks a function or class as deprecated; the warning is emitted only once per process.
+
+     Args:
+         message (str): custom warning message, defaults to None.
+     """
+
+     def decorator(obj):
+         # If it is a function
+         if isinstance(obj, type(lambda: None)):
+             @functools.wraps(obj)
+             def wrapper(*args, **kwargs):
+                 obj_id = id(obj)  # use the object's memory address as its unique key
+                 if obj_id not in _warned_once:
+                     default_msg = f"Function {obj.__name__} is deprecated."
+                     warn_msg = f"{default_msg} {message}" if message else default_msg
+                     warnings.warn(
+                         warn_msg,
+                         category=DeprecationWarning,
+                         stacklevel=2
+                     )
+                     _warned_once.add(obj_id)  # record that the warning was issued
+                 return obj(*args, **kwargs)
+
+             return wrapper
+
+         # If it is a class
+         elif isinstance(obj, type):
+             orig_init = obj.__init__
+
+             @functools.wraps(orig_init)
+             def new_init(self, *args, **kwargs):
+                 obj_id = id(obj)
+                 if obj_id not in _warned_once:
+                     default_msg = f"Class {obj.__name__} is deprecated."
+                     warn_msg = f"{default_msg} {message}" if message else default_msg
+                     warnings.warn(
+                         warn_msg,
+                         category=DeprecationWarning,
+                         stacklevel=2
+                     )
+                     _warned_once.add(obj_id)  # record that the warning was issued
+                 orig_init(self, *args, **kwargs)
+
+             obj.__init__ = new_init
+             return obj
+
+         else:
+             raise TypeError("This decorator can only be applied to functions and classes")
+
+     return decorator
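
For orientation, a minimal usage sketch of the decorator above; the function name and message are illustrative, not part of the package:

# Hypothetical usage of the deprecated decorator defined above.
import warnings
warnings.simplefilter("default")  # DeprecationWarning is hidden by default in some contexts

from re_common.v2.baselibrary.decorators.utils import deprecated

@deprecated("Use new_parse() instead.")
def old_parse(text):
    return text.strip()

old_parse("  a  ")  # emits DeprecationWarning once per process
old_parse("  b  ")  # silent: the wrapped object's id() is already in _warned_once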
@@ -0,0 +1,76 @@
+ import pickle
+
+ import ahocorasick
+
+
+ class ACTool(object):
+
+     def __init__(self):
+         self.automaton = ahocorasick.Automaton()
+
+     def add_word(self, key, value, overwrite=True) -> bool:
+         """
+         Add an entry to the AC automaton; by default a duplicate key is overwritten.
+         :param key: the keyword to add
+         :param value: the associated value
+         :param overwrite: whether to overwrite an existing key, defaults to True
+         :return: whether the entry was added or overwritten
+         """
+         if key in self.automaton:  # check whether the key already exists
+             if overwrite:  # overwriting is allowed
+                 self.automaton.add_word(key, value)
+                 return True
+             else:  # overwriting not allowed, skip
+                 return False
+         else:  # key does not exist, add it directly
+             self.automaton.add_word(key, value)
+             return True
+
+     def is_exists_key(self, key) -> bool:
+         # whether the key exists
+         if self.automaton.exists(key):
+             return True
+         else:
+             return False
+
+     def make_automaton(self):
+         """
+         Must be called after all words have been added.
+         """
+         self.automaton.make_automaton()
+
+     def iter(self, key):
+         """
+         The result is an iterable; convert with list() to get [(end_index, value)].
+         tool.add_word("he", "word1")
+         tool.add_word("hello", "word2")
+
+         # find matches in a string
+         input_string = "hello world"
+         matches = list(tool.automaton.iter(input_string))
+         print(matches)  # [(1, 'word1'), (4, 'word2')]
+
+         (1, 'word1'):
+             end_index = 1: the match "he" ends at index 1 of input_string = "hello world" (the position of the last character 'e' of "he").
+             Indices of "hello world": h(0) e(1) l(2) l(3) o(4) (5) w(6) o(7) r(8) l(9) d(10).
+             value = 'word1': the value associated with the matched keyword "he".
+         (4, 'word2'):
+             end_index = 4: the match "hello" ends at index 4 of input_string = "hello world" (the position of its last character 'o').
+             value = 'word2': the value associated with the matched keyword "hello".
+
+         Note: results contain only the value, never the key; if you need the key, embed it in the value.
+         """
+
+         result_iter = self.automaton.iter(key)  # ahocorasick.AutomatonSearchIter
+         return result_iter
+     def save(self, local_temp_path):
+         """
+         Save the built AC automaton to a local file.
+         """
+         self.automaton.save(local_temp_path, pickle.dumps)
+
+     def load(self, local_temp_path):
+         """
+         Load a previously built AC automaton.
+         """
+         self.automaton = ahocorasick.load(local_temp_path, pickle.loads)
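
A minimal usage sketch for ACTool, mirroring the docstring of iter(); the keys and the save path are illustrative:

# Hypothetical usage of ACTool; requires the pyahocorasick package.
tool = ACTool()
tool.add_word("he", "word1")
tool.add_word("hello", "word2")
tool.make_automaton()                  # must be called before iter()
print(list(tool.iter("hello world")))  # [(1, 'word1'), (4, 'word2')] as (end_index, value)
tool.save("/tmp/ac.bin")               # persist the built automaton
tool.load("/tmp/ac.bin")               # restore it later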
@@ -0,0 +1,318 @@
+ import asyncio
+ import gzip
+ import json
+ import sqlite3
+ import time
+ import os
+ from io import BytesIO
+ from typing import Callable, Any, List
+
+ from hdfs import InsecureClient
+
+
+ class HDFSDataProcessor:
+     def __init__(
+         self,
+         hdfs_url="http://VIP-DC-MASTER-2:9870",
+         hdfs_user="root",
+         db_file="processed_files.db",
+         batch_size=50,
+         retry_limit=3,
+     ):
+         self.hdfs_url = hdfs_url
+         self.hdfs_user = hdfs_user
+         self.db_file = db_file
+         self.batch_size = batch_size
+         self.retry_limit = retry_limit
+         self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
+         self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
+         self.read_hdfs_model = "all"
+         self.init_db()
+
+     def init_db(self):
+         """Initialize the SQLite database"""
+         with sqlite3.connect(self.db_file) as conn:
+             cursor = conn.cursor()
+             cursor.execute("""
+                 CREATE TABLE IF NOT EXISTS processed_files (
+                     file_path TEXT PRIMARY KEY
+                 )
+             """)
+             conn.commit()
+
+     def save_processed_file(self, file_path):
+         """Record a file as processed"""
+         with sqlite3.connect(self.db_file) as conn:
+             cursor = conn.cursor()
+             cursor.execute("INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)", (file_path,))
+             conn.commit()
+
+     def is_file_processed(self, file_path):
+         """Check whether a file has already been processed"""
+         with sqlite3.connect(self.db_file) as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT file_path FROM processed_files WHERE file_path = ?", (file_path,))
+             result = cursor.fetchone()
+             return result is not None
+
+     def list_gz_files(self, hdfs_dir):
+         """List all gzip files in an HDFS directory"""
+         return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
+
+     def count_total_lines(self, gz_file_path: str):
+         with self.client.read(gz_file_path) as hdfs_file:
+             with gzip.GzipFile(fileobj=hdfs_file) as gz:
+                 return sum(1 for _ in gz)
+
+     def batch_read_gz(self, gz_file_path: str):
+         """Read a gz file in batches"""
+         with self.client.read(gz_file_path) as hdfs_file:
+             with gzip.GzipFile(fileobj=hdfs_file) as gz:
+                 while True:
+                     lines = []
+                     for _ in range(self.batch_size):
+                         try:
+                             line = next(gz)
+                             if line.strip():  # drop empty lines
+                                 lines.append(line.decode("utf-8"))  # decode
+                         except StopIteration:  # end of file
+                             break
+                     if not lines:
+                         break
+                     yield lines
+
+     def all_read_gz(self, gz_file_path: str, encoding='utf-8'):
+         """
+         Read the content of a .gz file on HDFS.
+         :param gz_file_path: HDFS file path (must end with .gz)
+         :param encoding: file encoding (defaults to utf-8)
+         :return: file content
+         """
+         with self.client.read(gz_file_path) as reader:  # read in binary mode
+             compressed_data = reader.read()  # read the compressed bytes
+         with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:  # decompress
+             content = gz_file.read().decode(encoding)  # decode to a string
+         print(f"File read successfully: {gz_file_path}")
+         lines = [i for i in content.splitlines() if i.strip()]
+         result = [lines[i:i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+         return result
+
+     async def process_data(self, data, process_func):
+         """Run the processing function on one record"""
+         retry_count = 0
+         while retry_count < self.retry_limit:
+             try:
+                 await process_func(data)
+                 return  # exit once processing succeeds
+             except Exception as e:
+                 retry_count += 1
+                 print(f"Error while processing data: {e}, retrying {retry_count}/{self.retry_limit}, data: {data}")
+                 await asyncio.sleep(2 ** retry_count)
+         print(f"Failed to process data, retry limit reached, data: {data}")
+
+     async def process_file(self, hdfs_file_path, process_func):
+         """Process a single gz file"""
+         total_lines = self.count_total_lines(hdfs_file_path)
+         processed_lines = 0
+         start_time = time.time()
+         # pick the file-reading method according to the configuration
+         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
+             processing_start_time = time.time()  # start time of this batch
+
+             tasks = []
+             for line in lines:
+                 try:
+                     data = json.loads(line)
+                     tasks.append(self.process_data(data, process_func))
+                 except json.JSONDecodeError as e:
+                     print(f"Failed to parse JSON: {e}, line content: {line.strip()}")
+
+             # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool suits submitting all tasks at once with a concurrency limit
+             await asyncio.gather(*tasks)
+
+             processed_lines += len(lines)
+
+             elapsed_time = time.time() - start_time  # elapsed time
+             processing_time = time.time() - processing_start_time  # time for this batch
+             avg_processing_time = (
+                 (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+             )  # average processing time per record (milliseconds)
+
+             # estimate the remaining time
+             remaining_time = (
+                 ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                 if processed_lines > 0
+                 else float("inf")
+             )
+
+             # report overall progress
+             print(
+                 f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                 f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
+                 f"estimated remaining: {remaining_time:.2f}s | avg per record: {avg_processing_time:.2f}ms"
+             )
+
+         # final progress report
+         final_elapsed_time = time.time() - start_time  # total elapsed time
+         print(
+             f"File: {hdfs_file_path} done | progress: {processed_lines}/{total_lines} lines | "
+             f"total elapsed: {final_elapsed_time:.2f}s | "
+             f"avg per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+             if processed_lines > 0
+             else "no data processed"
+         )
+
+         self.save_processed_file(hdfs_file_path)  # record the file as processed
+
+     async def retry_process_file(self, hdfs_file_path, process_func):
+         """File processing with a retry mechanism"""
+         retry_count = 0
+         while retry_count < self.retry_limit:
+             try:
+                 await self.process_file(hdfs_file_path, process_func)
+                 return True  # exit once processing succeeds
+             except Exception as e:
+                 retry_count += 1
+                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
+                 await asyncio.sleep(2 ** retry_count)
+         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
+         return False
+         # raise
+
+     async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any]):
+         """Process all gz files in a directory"""
+         gz_files = self.list_gz_files(hdfs_dir)
+         all_succeed = True
+         for hdfs_file_path in gz_files:
+             if self.is_file_processed(hdfs_file_path):
+                 print(f"Skipping already processed file: {hdfs_file_path}")
+                 continue  # skip files that were already processed
+             succeed = await self.retry_process_file(hdfs_file_path, process_func)  # process the file
+             if succeed is False:
+                 all_succeed = False
+
+         if all_succeed:
+             # delete the database file once everything is done
+             try:
+                 if os.path.exists(self.db_file):
+                     os.remove(self.db_file)
+                     print(f"Deleted the resume-checkpoint file: {self.db_file}")
+             except Exception as e:
+                 print(f"Failed to delete the resume-checkpoint file: {e}")
+
+     async def process_file_bulk(self, hdfs_file_path, process_func):
+         """Process a single file in batches, handing each whole batch to the processing function"""
+         total_lines = self.count_total_lines(hdfs_file_path)
+         processed_lines = 0
+         start_time = time.time()
+
+         tasks = []
+         # pick the file-reading method according to the configuration
+         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
+             processing_start_time = time.time()  # start time of this batch
+
+             batch_data = []
+             for line in lines:
+                 try:
+                     data = json.loads(line)
+                     batch_data.append(data)
+                 except json.JSONDecodeError as e:
+                     print(f"Failed to parse JSON: {e}, line content: {line.strip()}")
+
+             # process the batch that was read
+             if batch_data:
+                 tasks.append(process_func(batch_data))  # hand the batch to the processing function and collect the task
+                 processed_lines += len(batch_data)  # update the processed-line counter
+
+             # once batch_size tasks have accumulated, run them concurrently
+             if len(tasks) >= self.batch_size:
+                 await asyncio.gather(*tasks)  # process several batches at the same time
+
+                 elapsed_time = time.time() - start_time  # elapsed time
+                 processing_time = time.time() - processing_start_time  # time for this round
+                 avg_processing_time = (
+                     (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+                 )  # average processing time per record (milliseconds)
+
+                 # estimate the remaining time
+                 remaining_time = (
+                     ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                     if processed_lines > 0
+                     else float("inf")
+                 )
+
+                 # report overall progress
+                 print(
+                     f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                     f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
+                     f"estimated remaining: {remaining_time:.2f}s | avg per record: {avg_processing_time:.2f}ms"
+                 )
+
+                 # clear the task list, ready for the next round
+                 tasks.clear()
+         # handle the remaining tasks
+         if tasks:
+             await asyncio.gather(*tasks)  # run the leftovers that never reached batch_size
+
+         # final progress report
+         final_elapsed_time = time.time() - start_time  # total elapsed time
+         print(
+             f"File: {hdfs_file_path} done | progress: {processed_lines}/{total_lines} lines | "
+             f"total elapsed: {final_elapsed_time:.2f}s | "
+             f"avg per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+             if processed_lines > 0
+             else "no data processed"
+         )
+
+         self.save_processed_file(hdfs_file_path)
+
+     async def retry_process_file_bulk(self, hdfs_file_path, process_func):
+         """Batch file processing with a retry mechanism"""
+         retry_count = 0
+         while retry_count < self.retry_limit:
+             try:
+                 await self.process_file_bulk(hdfs_file_path, process_func)
+                 return True  # exit once processing succeeds
+             except Exception as e:
+                 retry_count += 1
+                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
+                 await asyncio.sleep(2 ** retry_count)
+         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
+         return False
+
+     async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
+         """Process the data of all gz files batch by batch"""
+         gz_files = self.list_gz_files(hdfs_dir)
+         all_succeed = True
+         for hdfs_file_path in gz_files:
+             if self.is_file_processed(hdfs_file_path):
+                 print(f"Skipping already processed file: {hdfs_file_path}")
+                 continue  # skip files that were already processed
+             succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func)
+             if succeed is False:
+                 all_succeed = False
+
+         if all_succeed:
+             # delete the database file once everything is done
+             try:
+                 if os.path.exists(self.db_file):
+                     os.remove(self.db_file)
+                     print(f"Deleted the resume-checkpoint file: {self.db_file}")
+             except Exception as e:
+                 print(f"Failed to delete the resume-checkpoint file: {e}")
+
+ # # Usage example
+ # async def update_refer(data: dict):
+ #     ref_id = data["ref_id"]
+ #     url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
+ #     update_data = data["update_data"]
+ #     if not update_data:
+ #         return
+ #
+ #     # the actual processing logic goes here
+ #     await ApiNetUtils.fetch_post(url=url, payload=update_data)
+ #
+ #
+ # if __name__ == "__main__":
+ #     processor = HDFSDataProcessor()  # instantiate the data processor
+ #     asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
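
The commented example above exercises the per-record path (batch_process_file); a comparable sketch for the bulk path, with an assumed HDFS directory and handler body, might look like this:

# Hypothetical sketch for the bulk path; directory and handler are assumptions.
import asyncio

async def handle_batch(batch):
    # batch is a List[dict] parsed from one read batch of a .gz file
    for record in batch:
        ...  # real per-record work goes here

async def main():
    processor = HDFSDataProcessor(batch_size=100)
    await processor.batch_process_file_bulk("/user/example/output", handle_batch)

asyncio.run(main())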
@@ -3,9 +3,10 @@ from typing import List
  import jieba
  from datasketch import MinHash, minhash
 
+ from re_common.v2.baselibrary.decorators.utils import deprecated
  from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
 
-
+ @deprecated("Please use the methods in TextMatcherV2 instead.")
  def tokenize(text: str, stopwords=None) -> List[str]:
      """
      Tokenize and remove stopwords
@@ -32,7 +33,7 @@ def tokenize(text: str, stopwords=None) -> List[str]:
      words = [w for w in words if w not in stopwords and w.strip()]
      return words
 
-
+ @deprecated("Please use the methods in TextMatcherV2 instead.")
  def create_minhash(words: List[str], num_perm=128) -> MinHash:
      """
      Create a MinHash for the token list
@@ -42,7 +43,7 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
          minhash.update(word.encode("utf-8"))
      return minhash
 
-
+ @deprecated("Please use the methods in TextMatcherV2 instead.")
  def get_str_minhash(title):
      from re_common.v2.baselibrary.utils.string_clear import rel_clear
      rel_title = rel_clear(title)
@@ -1,10 +1,16 @@
+ import pickle
+
  import jieba
  import re
- from typing import List, Dict, Tuple, Set, Optional, Union
+ from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
  from datasketch import MinHash, MinHashLSH
 
+ from re_common.v2.baselibrary.decorators.utils import deprecated
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
+
 
- class TextMatcher:
+ @deprecated("Please use TextMatcherV2 instead.")
+ class TextMatcher(object):
      def __init__(
          self,
          threshold: float = 0.5,
@@ -188,36 +194,133 @@ class TextMatcher:
          self.doc_counter = 0
 
 
- if __name__ == "__main__":
-     # create a matcher instance
-     matcher = TextMatcher(
-         threshold=0.1,  # similarity threshold
-         num_perm=128,  # number of MinHash permutations
-     )
+ # A protocol describing the duck-typed tokenizer interface
+ class TokenizeDuckLike(Protocol):
+     def get_words(self, text) -> List:
+         pass
+
+
+ class JiebaTokenize(object):
+
+     def __init__(self, stopwords=None):
+         self.stopwords = stopwords
+
+     def get_words(self, text) -> List:
+
+         # default to an empty stopword list
+         stopwords = self.stopwords or []
+         words = jieba.lcut(text)
+
+         # count single-character tokens, to catch text jieba cannot segment and instead splits into individual characters
+
+         # a helper function is used instead of inlining the check in the comprehension: in some Spark jobs the comprehension's if condition is not short-circuited and is treated as a single operator
+         def is_singel_en(i):
+             if len(i) == 1 and not is_single_cjk_char(i):
+                 return True
+             return False
+
+         one_char_size = len([i for i in words if is_singel_en(i)])
+         all_size = len(words)
+         # if the proportion of single characters is too high, fall back to whitespace tokenization
+         if all_size != 0 and one_char_size / all_size > 0.6:
+             words = [i for i in text.split() if i.strip()]
+
+         # filter out stopwords and empty strings
+         words = [w for w in words if w not in stopwords and w.strip()]
+         return words
+
+
+ class TextMatcherV2(object):
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         num_perm: int = 128,
+         tdk: TokenizeDuckLike = None
+     ):
+         """
+         Initialize the text matcher
+
+         Args:
+             threshold: LSH similarity threshold
+             num_perm: number of MinHash permutations
+             tdk: tokenizer object implementing TokenizeDuckLike,
+                  e.g. JiebaTokenize(stopwords=...)
+         """
+         self.threshold = threshold
+         self.num_perm = num_perm
+         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+         self.tdk = tdk
+
+     def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
+         if isinstance(minhash, str):
+             minhash = self.str_to_minihash(minhash, tdk)
 
-     # add a single document
-     doc_id = matcher.add_document(
-         "北京是中国的首都"
-     )
+         self.lsh.insert(doc_id, minhash)
 
-     # add documents in bulk
-     docs = {"doc1": "北京是一座现代化的大都市", "doc2": "上海是中国最大的城市", "doc3": "中国的首都是北京"}
-     matcher.batch_add_documents(docs)
+     def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
+         def _add_document(minhash_or_str, tdk):
+             if isinstance(minhash_or_str, str):
+                 minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
+             self.add_document(docid, minhash_or_str, tdk)
+
+         if isinstance(betch_data, list):
+             # entries must be unpackable two-element tuples or lists
+             for docid, minhash_or_str in betch_data:
+                 _add_document(minhash_or_str, tdk)
+         elif isinstance(betch_data, dict):
+             for docid, minhash_or_str in betch_data.items():
+                 _add_document(minhash_or_str, tdk)
+         else:
+             raise Exception("invalid data type")
+
+     def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
+         # query the candidate set via LSH
+         if isinstance(query_minhash, str):
+             query_minhash = self.str_to_minihash(query_minhash, tdk)
+         similar_docs = self.lsh.query(query_minhash)
+         return similar_docs
 
-     # find similar documents (without similarity scores)
-     similar_docs = matcher.find_similar("北京首都")
-     print("similar document ids:", similar_docs)
+     def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
+         """
+         Create a MinHash for the token list
+         """
+         if num_perm is None:
+             num_perm = self.num_perm
+         minhash = MinHash(num_perm=num_perm)
+         for word in words:
+             minhash.update(word.encode("utf-8"))
+         return minhash
 
-     # find similar documents (with similarity scores)
-     similar_docs_with_scores = matcher.find_similar("北京首都", return_similarities=True)
-     print("similar document ids and scores:", similar_docs_with_scores)
+     def create_words(self, text: str, tdk: TokenizeDuckLike = None):
+         if tdk is None:
+             tdk = self.tdk
+         words = tdk.get_words(text)
+         return words
+
+     def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
+         if tdk is None:
+             tdk = self.tdk
+         words = self.create_words(text, tdk)
+         minhash = self.create_minhash(words, self.num_perm)
+         return minhash
 
-     # get the original text
-     for doc_id, score in similar_docs_with_scores:
-         print(f"document {doc_id}: {matcher.get_text(doc_id)} (similarity: {score:.2f})")
+     def minhash_dumps(self, minhash) -> bytes:
+         """
+         Serialize
+         """
+         serialized_minhash = pickle.dumps(minhash)
+         return serialized_minhash
 
-     # delete a document
-     matcher.remove_document("doc1")
+     def minhash_loads(self, serialized_minhash) -> MinHash:
+         """
+         Deserialize
+         """
+         minhash = pickle.loads(serialized_minhash)
+         return minhash
 
-     # clear all data
-     matcher.clear()
+     def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
+         """
+         Merge in an LSH index that was built elsewhere
+         """
+         self.lsh.merge(other_minhashlsh)
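
A minimal usage sketch for TextMatcherV2 with the jieba-based tokenizer; the document ids and texts are illustrative:

# Hypothetical usage of TextMatcherV2 defined above.
matcher = TextMatcherV2(threshold=0.5, num_perm=128, tdk=JiebaTokenize())

# Plain strings are tokenized and hashed internally via str_to_minihash().
matcher.add_document("doc1", "北京是一座现代化的大都市")
matcher.batch_add_documents({"doc2": "上海是中国最大的城市", "doc3": "中国的首都是北京"})

print(matcher.find_similar("北京首都"))  # candidate doc ids above the LSH threshold

# MinHash objects can be pickled for transport, e.g. between Spark tasks.
blob = matcher.minhash_dumps(matcher.str_to_minihash("中国的首都"))
restored = matcher.minhash_loads(blob)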
@@ -1,6 +1,8 @@
  # String handling for specific business cases; scenario-specific helpers rather than general-purpose utilities
  import re
 
+ from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
+
 
  def clean_organ_postcode(organ):
      """
@@ -120,6 +122,11 @@ def deal_rel_vol(vol_str: str):
      """
      Volume-handling logic used when merging journals
      """
+
+     # if the volume string is all symbols, clear it
+     if is_all_symbols(vol_str):
+         vol_str = ""
+
      if vol_str.replace(".", "").isdigit():
          try:
              float_num = float(vol_str)
@@ -156,20 +163,27 @@ def deal_num_strs(input_str):
      return input_str
 
 
- def deal_num(strs):
+ def deal_num(num_str):
      """
      Normalize the issue field for grouping, especially values containing hyphens
      Used during the second split of the merge step; may be reused elsewhere when the scenario fits
      :param strs:
      :return:
      """
-     strs = strs.replace("-", "_").replace(".", "_").upper()
-     if strs.find("_") > -1:
-         start, end = strs.split("_")
+     # if the issue string is all symbols, clear it
+     if is_all_symbols(num_str):
+         num_str = ""
+
+     if num_str.lower().startswith("n "):
+         num_str = num_str.lower().replace("n ", "").strip()
+
+     num_str = num_str.replace("-", "_").replace(".", "_").upper()
+     if num_str.find("_") > -1:
+         start, end = num_str.split("_")
          start = deal_num_strs(start)
          end = deal_num_strs(end)
-         strs = start + "_" + end
+         num_str = start + "_" + end
      else:
-         strs = deal_num_strs(strs)
+         num_str = deal_num_strs(num_str)
 
-     return strs
+     return num_str
@@ -0,0 +1,180 @@
+ import pika
+
+
+ # https://blog.csdn.net/songfreeman/article/details/50943603
+ class BasePika(object):
+
+     def __init__(self, username=None, password=None, mqhost=None, virtual_host=None):
+         self.username = username
+         self.password = password
+         self.conn = None
+         self.host = mqhost
+         self.virtual_host = virtual_host
+         self.auto_ack = True
+
+     def set_default(self):
+         self.host = "192.168.31.79"
+         self.virtual_host = "vhost_NetDataGather"
+         self.username = "vip"
+         self.password = "piv$*123"
+
+     def connect_str(self, amqp_str):
+         parameters = pika.URLParameters(amqp_str)
+         self.conn = pika.BlockingConnection(parameters)
+
+     def connect(self):
+         """
+         Connect using the configured username and password
+         :return:
+         """
+         credentials = pika.PlainCredentials(self.username, self.password)
+         # parameters = pika.URLParameters('amqp://guest:guest@rabbit-server1:5672/%2F')
+         # setting heartbeat to 0 disables RabbitMQ's heartbeat checks
+         parameters = pika.ConnectionParameters(host=self.host,
+                                                virtual_host=self.virtual_host,
+                                                credentials=credentials,
+                                                heartbeat=0)
+         self.conn = pika.BlockingConnection(parameters)
+
+     def close(self):
+         # close the connection
+         self.conn.close()
+
+     def create_channel(self):
+         self.channel = self.conn.channel()
+
+     def __del__(self):
+         self.channel.close()
+         self.conn.close()
+
+     def random_queue_declare(self):
+         """
+         result.method.queue will then contain a random queue name, e.g. amq.gen-JzTY20BRgKO-HjmUJj0wLg.
+         Additionally:
+         once the consumer disconnects, the queue is deleted automatically; the exclusive=True flag enables this.
+         :return:
+         """
+         return self.channel.queue_declare("", exclusive=True)
+
+     def queue_declare(self, queue="hello", durable=False):
+         """
+         Declare the destination queue "hello"; can also be called when consuming.
+         Declaring on both consumer and producer guarantees the queue exists regardless of which side starts first.
+         durable=True makes the queue persistent
+         :return:
+         """
+         return self.channel.queue_declare(queue=queue, durable=durable)
+
+     def get_queue_size(self, queue="hello"):
+         """
+         Get the length of a queue
+         :param queue:
+         :return:
+         """
+         queue = self.queue_declare(queue=queue, durable=True)
+         return queue.method.message_count
+
+     def get_properties(self):
+         """
+         Used together with durable=True in queue_declare;
+         pass the result as properties to easy_send_msg
+         :return:
+         """
+         return pika.BasicProperties(
+             delivery_mode=2,  # mark the message as persistent
+         )
+
+     def easy_send_msg(self, exchange="", routing_key="hello", body="hello world", properties=None):
+         """
+         An empty string selects the default (anonymous) exchange: if routing_key is set, the message is routed to the queue it names.
+         routing_key identifies the target queue, i.e. the queue name on the server
+         body is the message to send
+
+         with basic_publish, if exchange is not "" but no queue is bound, the message disappears
+         :return:
+         """
+         self.channel.basic_publish(exchange=exchange,
+                                    routing_key=routing_key,
+                                    body=body,
+                                    properties=properties)
+
+     def basic_ack(self, ch, method):
+         """
+         Message acknowledgement for the callback
+         :param ch:
+         :param method:
+         :return:
+         """
+         ch.basic_ack(delivery_tag=method.delivery_tag)
+
+     def callback(self, ch, method, properties, body):
+         """
+         Receiving messages from a queue is more involved: a callback function is subscribed to the queue.
+         Whenever a message is received, this callback is invoked automatically by the pika library;
+         in this example it simply prints the received message body to the screen
+         :param method:
+         :param properties:
+         :param body:
+         :return:
+         """
+         print(type(body))
+         print(" [x] Received %r" % body)
+         if self.auto_ack is False:
+             self.basic_ack(ch, method)
+
+     def set_get_msg_callback(self, routing_key="hello", callback=None, auto_ack=True):
+         """
+         Register the message-consuming callback
+         auto_ack=True enables automatic acknowledgement
+         auto_ack=False means the callback must acknowledge
+         :return:
+         """
+         self.auto_ack = auto_ack
+         if callback is None:
+             callback = self.callback
+         self.channel.basic_consume(routing_key,
+                                    callback,
+                                    auto_ack=auto_ack)
+
+     def start_get_msg(self):
+         """
+         Start consuming; loops and fetches messages indefinitely
+         :return:
+         """
+         self.channel.start_consuming()
+
+     def basic_qos(self, prefetch_count=1):
+         """
+         Lets the broker send several messages ahead of time; has no effect when auto_ack=True
+         prefetch_count=1 means: do not send a new message until the current one has been handled
+         :return:
+         """
+         self.channel.basic_qos(prefetch_count=prefetch_count)
+
+     def exchange_declare(self, exchangename="logs", type="fanout"):
+         """
+         A fanout exchange is very simple: as the name suggests, it broadcasts every message received from the producer to all receivers it knows about, which is exactly what our logger needs.
+         Exchange types:
+         A direct exchange uses a simple routing algorithm: a message goes to the queues whose binding key exactly matches the message's routing key.
+
+         A message sent to a topic exchange cannot use an arbitrary routing_key: it must be a dot-separated list of words. The words can be anything, but usually describe features of the message, e.g. "stock.usd.nyse".
+         topic is an upgraded version of the direct exchange
+
+         headers exchange: matches on AMQP message headers instead of the routing key. It is otherwise identical to direct but performs much worse, so it is rarely useful any more.
+         exchangename is subsequently bound to a queue
+         direct, topic, headers and fanout
+         :return:
+         """
+         return self.channel.exchange_declare(exchange=exchangename,
+                                              exchange_type=type)
+
+     def queue_bind(self, exchange="logs", queue="", routing_key=""):
+         """
+         For a temporary queue, obtain the name via self.random_queue_declare().method.queue
+         :param exchange:
+         :param queue:
+         :return:
+         """
+         self.channel.queue_bind(exchange=exchange,
+                                 queue=queue,
+                                 routing_key=routing_key)
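
A minimal producer/consumer sketch for BasePika; the host, credentials and queue name here are assumptions, not the package defaults:

# Hypothetical usage of BasePika against a local RabbitMQ broker.
mq = BasePika(username="guest", password="guest", mqhost="localhost", virtual_host="/")
mq.connect()
mq.create_channel()
mq.queue_declare(queue="hello", durable=True)

# Publish a persistent message (delivery_mode=2 pairs with durable=True).
mq.easy_send_msg(routing_key="hello", body="hello world", properties=mq.get_properties())

# Consume with manual acks: the default callback() acks when auto_ack=False.
mq.basic_qos(prefetch_count=1)
mq.set_get_msg_callback(routing_key="hello", auto_ack=False)
mq.start_get_msg()  # blocks, looping over incoming messages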
@@ -0,0 +1,38 @@
+ from contextlib import asynccontextmanager
+ from typing import AsyncGenerator, Tuple
+
+ import aiomysql
+ from aiomysql import Pool, Connection, Cursor
+
+ DB_CONFIG = {
+     'host': '192.168.98.55',
+     'port': 4000,
+     'user': 'dataware_house_baseUser',
+     'password': 'FF19AF831AEBD580B450B16BF9264200',
+     'db': 'dataware_house_base',
+     'charset': 'utf8mb4',
+     'minsize': 16,  # minimum number of connections
+     'maxsize': 128,  # maximum number of connections
+     'autocommit': False,  # autocommit transactions
+     'pool_recycle': 3600,  # recycle time per connection (seconds); older connections are closed and recreated to avoid stale ones
+     'echo': False,  # log SQL statements
+ }
+
+
+ @asynccontextmanager
+ async def get_db_pool():
+     """Async database connection pool manager"""
+     pool: Pool = await aiomysql.create_pool(**DB_CONFIG)
+     try:
+         yield pool
+     finally:
+         pool.close()
+         await pool.wait_closed()
+
+
+ @asynccontextmanager
+ async def get_session(pool: Pool) -> AsyncGenerator[Tuple[Connection, Cursor], None]:
+     """Acquire a database session"""
+     async with pool.acquire() as conn:
+         async with conn.cursor() as cursor:
+             yield conn, cursor
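
A minimal usage sketch for the two context managers above; the query is illustrative:

# Hypothetical usage of get_db_pool/get_session.
import asyncio

async def main():
    async with get_db_pool() as pool:
        async with get_session(pool) as (conn, cursor):
            await cursor.execute("SELECT 1")
            print(await cursor.fetchone())
            await conn.commit()  # DB_CONFIG sets autocommit to False

asyncio.run(main())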
@@ -0,0 +1,83 @@
+ import logging
+ import traceback
+
+ from re_common.v2.baselibrary.utils.basepika import BasePika
+ from retry import retry
+
+ logging_logger = logging.getLogger(__name__)
+
+
+ class UseMq(object):
+
+     def __init__(self, queue, qos=1):
+         self.queue = queue
+         self.qos = qos
+         self.basepika = BasePika()
+         self.basepika.set_default()
+         self.basepika.connect()
+         self.basepika.create_channel()
+         self.basepika.queue_declare(queue=queue, durable=True)
+         self.basepika.basic_qos(qos)
+         self.properties = self.basepika.get_properties()
+
+     def re_conn(self):
+         """
+         Reconnect
+         :return:
+         """
+         self.basepika.connect()
+         self.basepika.create_channel()
+         self.basepika.queue_declare(queue=self.queue, durable=True)
+         self.basepika.basic_qos(self.qos)
+
+     @retry(delay=5, backoff=2, max_delay=60 * 3, logger=logging_logger)
+     def get_mq(self):
+         try:
+             if self.basepika.channel.is_closed:
+                 logging_logger.info("reconnecting......")
+                 self.re_conn()
+                 logging_logger.info("reconnected......")
+             self.basepika.set_get_msg_callback(routing_key=self.queue, callback=self.callback, auto_ack=False)
+             self.basepika.start_get_msg()
+         except:
+             traceback.print_exc()
+             logging_logger.info("reconnecting......")
+             self.re_conn()
+
+     def callback(self, ch, method, properties, body):
+         # print(type(body))
+         # print(" [x] Received %r" % body)
+         # body = body.decode()
+         self.callback2(ch, method, properties, body)
+         if self.basepika.auto_ack is False:
+             self.basepika.basic_ack(ch, method)
+
+     def callback2(self, ch, method, properties, body):
+         pass
+
+     @retry(delay=5, backoff=2, max_delay=60 * 3, logger=logging_logger)
+     def send_mq(self, body, num=100):
+         try:
+             if self.basepika.get_queue_size(self.queue) < num:
+                 self.basepika.easy_send_msg(routing_key=self.queue,
+                                             body=body,
+                                             properties=self.properties)
+                 return True
+             else:
+                 return False
+         except:
+             traceback.print_exc()
+             logging_logger.info("reconnecting......")
+             self.re_conn()
+             return False
+
+     def get_server_mq_num(self, num=100):
+         if self.basepika.get_queue_size(self.queue) < num:
+             return True
+         else:
+             return False
+
+     def easy_send_mq(self, body):
+         self.basepika.easy_send_msg(routing_key=self.queue,
+                                     body=body,
+                                     properties=self.properties)
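
A minimal subclass sketch for UseMq; the queue name and handler body are assumptions:

# Hypothetical UseMq subclass: override callback2 with the real message handling.
class MyWorker(UseMq):
    def callback2(self, ch, method, properties, body):
        # body is the raw bytes of one message; the base callback acks after this returns
        print("got:", body.decode())

worker = MyWorker(queue="demo_queue", qos=1)
worker.send_mq("hello", num=100)  # sends only while the queue holds fewer than num messages
worker.get_mq()                   # blocks, consuming with manual acks and auto-reconnect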
@@ -1,5 +1,7 @@
  import re
 
+ import unicodedata
+
 
  def is_all_english_chars(s):
      return bool(re.match(r'^[A-Za-z]+$', s))
@@ -29,7 +31,7 @@ def is_empty(value):
          import pandas as pd
          if pd.isna(value):
              return True
-     except ImportError:
+     except:
          pass  # skip if pandas is not installed
 
      # if it is a string, check whether it is empty after stripping whitespace
@@ -88,3 +90,13 @@ def is_single_cjk_char(char):
          if start <= code_point <= end:
              return True
      return False
+
+
+ def is_all_symbols(text):
+     # whether the string consists solely of symbols
+     # an empty string returns False
+     if not text:
+         return False
+
+     # check that every character falls into a symbol category
+     return all(unicodedata.category(char).startswith(('P', 'S')) for char in text)
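
A few illustrative calls for is_all_symbols (Unicode categories starting with 'P' are punctuation, 'S' are symbols):

# Illustrative behaviour of is_all_symbols.
print(is_all_symbols("!?--"))  # True: all punctuation
print(is_all_symbols("$+="))   # True: all symbols
print(is_all_symbols("a-b"))   # False: contains a letter
print(is_all_symbols(""))      # False by definition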
@@ -1,4 +1,6 @@
  import re
+ from urllib.parse import unquote
+
 
  import regex
 
  from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
@@ -116,6 +118,13 @@ class StringClear(object):
          self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
          return self
 
+     def url_to_str(self):
+         """
+         Decode URL-encoded text into characters
+         """
+         self.obj_str = unquote(self.obj_str)
+         return self
+
      def get_str(self):
          return self.obj_str
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: re_common
- Version: 10.0.13
+ Version: 10.0.15
  Summary: a library about all python projects
  Home-page: https://gitee.com/xujiangios/re-common
  Author: vic
@@ -163,26 +163,33 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
  re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
  re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ re_common/v2/baselibrary/decorators/utils.py,sha256=Q4D6KKCQxvNBXZkPQQn14keKKJpGtg8TUSakjJU40s0,2056
  re_common/v2/baselibrary/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
  re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5aEw-p2u1kYLpvr7RA,2796
  re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
  re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
  re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
+ re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=cChy6vhK8uSVIf3bRMGWpjociIbkiV-0j29WlZqQXHM,14207
  re_common/v2/baselibrary/tools/list_tools.py,sha256=qYxdLccRbrULOBbaPdJ_MyFFmVJGVMdW5E36nJ3ejr8,249
- re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
- re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
+ re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
+ re_common/v2/baselibrary/tools/text_matcher.py,sha256=cPMoFxaA0-ce3tLRxVSs8_3pTYS1oVIHDnNy_AlPU-4,10756
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
- re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=__9MECbdrMnYc-ksYn2liM8vEbqF9uR4hZKqw86kW1Q,5924
+ re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=qY6bWcucZIU7e4yiD5-x46iCdp4HFNg_32utsysCKkc,6322
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
  re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
+ re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
+ re_common/v2/baselibrary/utils/db.py,sha256=6HfmQHAtDm-pFFoe-ouNQggkfGRdN8Do2pN4B0ev_WU,1204
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
+ re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
- re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
- re_common/v2/baselibrary/utils/string_clear.py,sha256=pGxL9PlzQDM06sC0j6U0zYRemvsJ7-OOpfzS5ETCxAs,6258
+ re_common/v2/baselibrary/utils/string_bool.py,sha256=0JxzftuL61UAF-2Vp9F1Og8kXp_y647KJC5jXus9QwM,3278
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=1QAb_IC8FoVL5KzXhPicz4stsYD7LyASh5sXaXfs084,6445
  re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
  re_common/v2/baselibrary/utils/stringutils.py,sha256=WuxhXJVU6xuGfgHiSjxrn7Go1eobpa8DMR3Icoey4vo,6039
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -211,8 +218,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
- re_common-10.0.13.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
- re_common-10.0.13.dist-info/METADATA,sha256=QdkCM_LoID9na9D2qL4Zamec2QOdKtXgOxCWBSZQO8k,582
- re_common-10.0.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- re_common-10.0.13.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
- re_common-10.0.13.dist-info/RECORD,,
+ re_common-10.0.15.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ re_common-10.0.15.dist-info/METADATA,sha256=IhfGSUxRXpHVDZv-ZwqvSxD6yiI_WbeDEHVDpn-RvyU,582
+ re_common-10.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ re_common-10.0.15.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
+ re_common-10.0.15.dist-info/RECORD,,