re-common 10.0.16__py3-none-any.whl → 10.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +41 -21
- re_common/v2/baselibrary/tools/list_tools.py +14 -0
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +8 -1
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -0
- re_common/v2/baselibrary/utils/string_clear.py +10 -0
- {re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/METADATA +10 -2
- {re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/RECORD +10 -9
- {re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/WHEEL +1 -1
- {re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/LICENSE +0 -0
- {re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/top_level.txt +0 -0
re_common/v2/baselibrary/tools/hdfs_data_processer.py

@@ -1,6 +1,7 @@
 import asyncio
 import gzip
 import json
+from pathlib import Path
 import sqlite3
 import time
 import os
@@ -12,12 +13,12 @@ from hdfs import InsecureClient
 
 class HDFSDataProcessor:
     def __init__(
-
-
-
-
-
-
+        self,
+        hdfs_url="http://VIP-DC-MASTER-2:9870",
+        hdfs_user="root",
+        db_file="processed_files.db",
+        batch_size=50,
+        retry_limit=3,
     ):
         self.hdfs_url = hdfs_url
         self.hdfs_user = hdfs_user
@@ -44,14 +45,20 @@ class HDFSDataProcessor:
         """Save a record of a processed file"""
         with sqlite3.connect(self.db_file) as conn:
             cursor = conn.cursor()
-            cursor.execute(
+            cursor.execute(
+                "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
+                (file_path,),
+            )
             conn.commit()
 
     def is_file_processed(self, file_path):
         """Check whether a file has already been processed"""
         with sqlite3.connect(self.db_file) as conn:
             cursor = conn.cursor()
-            cursor.execute(
+            cursor.execute(
+                "SELECT file_path FROM processed_files WHERE file_path = ?",
+                (file_path,),
+            )
             result = cursor.fetchone()
             return result is not None
 
@@ -81,7 +88,7 @@ class HDFSDataProcessor:
                 break
             yield lines
 
-    def all_read_gz(self, gz_file_path: str, encoding=
+    def all_read_gz(self, gz_file_path: str, encoding="utf-8"):
         """
         Read the contents of a .gz file on HDFS.
         :param hdfs_path: HDFS file path (must end with .gz)
@@ -94,7 +101,7 @@
             content = gz_file.read().decode(encoding)  # decode to a string
             print(f"File read successfully: {gz_file_path}")
             lines = [i for i in content.split("\n") if i.strip()]
-            result = [lines[i:i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+            result = [lines[i : i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
             return result
 
     async def process_data(self, data, process_func):
@@ -102,19 +109,19 @@
         retry_count = 0
         while retry_count < self.retry_limit:
             try:
-                await process_func(data)
-                return  # exit once processing succeeds
+                return await process_func(data)  # exit once processing succeeds
             except Exception as e:
                 retry_count += 1
                 print(f"Error while processing data: {e}, retrying {retry_count}/{self.retry_limit}, data: {data}")
-                await asyncio.sleep(2
+                await asyncio.sleep(2**retry_count)
         print(f"Failed to process data, retry limit reached, data: {data}")
 
-    async def process_file(self, hdfs_file_path, process_func):
+    async def process_file(self, hdfs_file_path, process_func, write_dir: str):
         """Process a single gz file"""
         total_lines = self.count_total_lines(hdfs_file_path)
         processed_lines = 0
         start_time = time.time()
+        results = []
         # # choose the file-reading method here according to the configuration
         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
             processing_start_time = time.time()  # record when this batch started
@@ -128,7 +135,7 @@
                         print(f"Failed to parse JSON: {e}, line content: {line.strip()}")
 
             # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool suits submitting all tasks at once and limiting the concurrency
-            await asyncio.gather(*tasks)
+            results.extend(await asyncio.gather(*tasks))
 
             processed_lines += len(lines)
 
@@ -152,6 +159,18 @@
                 f"estimated remaining time: {remaining_time:.2f}s | average per-item time: {avg_processing_time:.2f}ms"
             )
 
+        def generate_write_data(results):
+            for res in results:
+                yield str(res) + "\n"
+
+        if write_dir is not None:
+            self.client.write(
+                write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}",
+                data=generate_write_data(results),
+                overwrite=True,
+                encoding="utf-8",
+            )
+
         # final progress display
         final_elapsed_time = time.time() - start_time  # total elapsed time
         print(
@@ -164,22 +183,22 @@
 
         self.save_processed_file(hdfs_file_path)  # record the processed file
 
-    async def retry_process_file(self, hdfs_file_path, process_func):
+    async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
         """File processing with a retry mechanism"""
         retry_count = 0
         while retry_count < self.retry_limit:
             try:
-                await self.process_file(hdfs_file_path, process_func)
+                await self.process_file(hdfs_file_path, process_func, write_dir)
                 return True  # exit once processing succeeds
             except Exception as e:
                 retry_count += 1
                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
-                await asyncio.sleep(2
+                await asyncio.sleep(2**retry_count)
         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
         return False
         # raise
 
-    async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any]):
+    async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any], write_dir: str = None):
         """Batch-update all gz files"""
         gz_files = self.list_gz_files(hdfs_dir)
         all_succeed = True
@@ -187,7 +206,7 @@
             if self.is_file_processed(hdfs_file_path):
                 print(f"Skipping already-processed file: {hdfs_file_path}")
                 continue  # skip files that were already processed
-            succeed = await self.retry_process_file(hdfs_file_path, process_func)  # process the file
+            succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir)  # process the file
             if succeed is False:
                 all_succeed = False
 
@@ -276,7 +295,7 @@
             except Exception as e:
                 retry_count += 1
                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
-                await asyncio.sleep(2
+                await asyncio.sleep(2**retry_count)
         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
         return False
 
@@ -301,6 +320,7 @@
         except Exception as e:
             print(f"Failed to delete the checkpoint retry file: {e}")
 
+
 # # usage example
 # async def update_refer(data: dict):
 #     ref_id = data["ref_id"]
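Below is a short usage sketch of the new write_dir parameter on batch_process_file, assuming a reachable HDFS cluster; the namenode address, paths, and the enrich coroutine are illustrative, not from the package. Per this diff, the per-record coroutine's return values are now collected and, when write_dir is given, written back to HDFS under the input file's stem:

import asyncio

from re_common.v2.baselibrary.tools.hdfs_data_processer import HDFSDataProcessor

async def enrich(record: dict):
    # hypothetical per-record coroutine; whatever it returns is what
    # process_file gathers into `results` and writes out
    record["processed"] = True
    return record

async def main():
    # hostname and directory paths below are placeholders
    processor = HDFSDataProcessor(hdfs_url="http://namenode:9870", hdfs_user="root")
    # with write_dir set, each input file's results land in /data/out/<file-stem>
    await processor.batch_process_file("/data/in", enrich, write_dir="/data/out")

asyncio.run(main())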
re_common/v2/baselibrary/tools/list_tools.py

@@ -50,3 +50,17 @@ def filter_and_sort_by_smi(all_list, top_n=1000):
 
     # 3. take the first top_n items
     return unique_list[:top_n]
+
+
+def list_to_dict(list_data,key_name):
+    # use defaultdict to handle duplicate ids
+    from collections import defaultdict
+
+    dict_data = defaultdict(list)
+
+    for item in list_data:
+        dict_data[item[key_name]].append(item)
+
+    # convert the defaultdict back into a plain dict
+    dict_data = dict(dict_data)
+    return dict_data
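A minimal sketch of the new list_to_dict helper, which groups a list of dicts by a key; the sample rows are hypothetical:

from re_common.v2.baselibrary.tools.list_tools import list_to_dict

rows = [
    {"id": "a", "val": 1},
    {"id": "a", "val": 2},
    {"id": "b", "val": 3},
]
grouped = list_to_dict(rows, "id")
print(grouped)
# {'a': [{'id': 'a', 'val': 1}, {'id': 'a', 'val': 2}], 'b': [{'id': 'b', 'val': 3}]}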
re_common/v2/baselibrary/utils/BusinessStringUtil.py

@@ -177,7 +177,14 @@ def deal_num(num_str):
     if num_str.lower().startswith("n "):
         num_str = num_str.lower().replace("n ", "").strip()
 
+    num_str = num_str.lower().replace("special_issue_", '').replace("_special_issue", '').replace("issue", "")
+    num_str = num_str.replace("spec.", "").replace("iss.", "").replace("spl.", "").replace("special.", "").replace(
+        "specialissue.", "")
+    num_str = num_str.replace("spec", "").replace("iss", "").replace("spl", "").replace("special", "").replace(
+        "specialissue", '')
+
     num_str = num_str.replace("-", "_").replace(".", "_").upper()
+    num_str = num_str.lstrip("_").rstrip("_")
     if num_str.find("_") > -1:
         start, end = num_str.split("_")
         start = deal_num_strs(start)
@@ -186,4 +193,4 @@ def deal_num(num_str):
     else:
         num_str = deal_num_strs(num_str)
 
-    return num_str
+    return num_str.lower().strip()
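To see what the added stripping buys, here is a standalone rerun of just the new replace chain (the input value is hypothetical); the surrounding deal_num logic, including deal_num_strs, is unchanged by this sketch:

num_str = "special_issue_12".lower()
num_str = num_str.replace("special_issue_", "").replace("_special_issue", "").replace("issue", "")
num_str = num_str.replace("spec.", "").replace("iss.", "").replace("spl.", "").replace("special.", "").replace("specialissue.", "")
num_str = num_str.replace("spec", "").replace("iss", "").replace("spl", "").replace("special", "").replace("specialissue", "")
num_str = num_str.replace("-", "_").replace(".", "_").upper()
num_str = num_str.lstrip("_").rstrip("_")
print(num_str)  # -> "12"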
re_common/v2/baselibrary/utils/base_string_similarity.py (new file)

@@ -0,0 +1,158 @@
+import Levenshtein
+import jellyfish
+from rapidfuzz.distance import DamerauLevenshtein, Hamming, Indel, LCSseq, OSA
+
+
+class BaseStringSimilarity(object):
+
+    @classmethod
+    def levenshtein_similarity(cls, str1, str2) -> float:
+        """
+        Returns an edit-distance score between the two strings
+        """
+        # edit-distance length
+        distance = Levenshtein.distance(str1, str2)
+        # score using the longer string's length as the divisor
+        similarity = 1 - (distance / max(len(str1), len(str2)))
+        return similarity
+
+    @classmethod
+    def damerau_normalized_distance_similarity(cls, str1, str2) -> float:
+        """
+        # computes the normalized edit distance in [0, 1], where smaller means more similar; scores are usually not graded smaller-is-better, so this is not used
+        similarity = DamerauLevenshtein.normalized_distance(str1, str2)
+        purpose: computes a similarity score in [0, max_len], where larger means more similar.
+        print(DamerauLevenshtein.similarity(str1, str2))
+        """
+        # same algorithm as cls.levenshtein_similarity; only the edit-distance value differs
+        similarity = DamerauLevenshtein.normalized_similarity(str1, str2)
+        return similarity
+
+    @classmethod
+    def indel_levenshtein_similarity(cls, str1, str2) -> float:
+        """
+        Essentially this uses the Indel.normalized_similarity(str1, str2) method
+
+        computes the Indel distance between str1 and str2 (the minimum number of insertions and deletions)
+        Indel.distance(str1, str2)
+        computes the normalized Indel distance in [0, 1], where 0 means identical and 1 means completely different: ``distance / (len1 + len2)``.
+        Indel.normalized_distance(str1, str2)
+        computes the Indel similarity in the [max, 0] range, computed as "(len1 + len2) - distance"
+        Indel.similarity(str1, str2)
+        computes the normalized insertion/deletion similarity in [0, 1], computed as "1 - normalized_distance"
+        Indel.normalized_similarity(str1, str2)
+
+        """
+        # compute the similarity (a value between 0 and 1; 1 means identical)
+        similarity = Levenshtein.ratio(str1, str2)
+        return similarity
+
+    @classmethod
+    def jaro_similarity(cls, str1, str2) -> float:
+        """
+        Jaro similarity is an algorithm for measuring the similarity of two strings; it mainly considers:
+        matching characters
+        character order
+        character transpositions (swapped positions)
+
+        consistent with Jaro.normalized_similarity(str1,str2)
+        """
+        return jellyfish.jaro_similarity(str1, str2)
+
+    @classmethod
+    def jaro_winkler_similarity(cls, str1, str2) -> float:
+        """
+        Jaro-Winkler is an improved version of Jaro that gives extra weight to matching prefixes
+
+        consistent with JaroWinkler.normalized_similarity(str1,str2)
+
+        print(JaroWinkler.distance(str1, str2))
+        consistent with print(JaroWinkler.normalized_distance(str1, str2))
+
+        print(JaroWinkler.similarity(str1, str2))
+        consistent with print(JaroWinkler.normalized_similarity(str1,str2))
+        """
+        return jellyfish.jaro_winkler_similarity(str1, str2)
+
+    @classmethod
+    def osa_similarity(cls, str1, str2) -> float:
+        """
+        Computes the normalized Optimal String Alignment (OSA) similarity in [0, 1].
+
+        Computed as "1 - normalized_distance"
+        """
+        return OSA.normalized_similarity(str1, str2)
+
+    @classmethod
+    def lcs_seq_similarity(cls, str1, str2) -> float:
+        """
+        Computes the normalized LCS similarity in [0, 1].
+        Computed as "1 - normalized_distance"
+        """
+        return LCSseq.normalized_similarity(str1, str2)
+
+    @classmethod
+    def lcs_seq_distance(cls, str1, str2) -> int:
+        """
+        LCSseq.distance is a method in the RapidFuzz library that computes the Longest Common Subsequence (LCS) distance between two strings.
+        The LCS is the longest subsequence that appears in both strings in order, though not necessarily contiguously. For example:
+        the LCS of "abcde" and "ace" is "ace" (length 3).
+        the LCS of "Druitt, Robert" and "Druitt R." might be "Druitt R" (length 8).
+        Computes the LCS distance in the [0, max] range.
+        Computed as "max(len1, len2) - similarity".
+        """
+        return LCSseq.distance(str1, str2)
+
+    @classmethod
+    def osa_distance(cls, str1, str2) -> int:
+        """
+        OSA.distance (Optimal String Alignment distance) is a method in the RapidFuzz library that computes the edit distance between two strings, but with stricter restrictions than the standard Levenshtein distance.
+
+        OSA additionally allows transpositions of adjacent characters, but is stricter than Damerau-Levenshtein (Damerau allows repeated transpositions, OSA only one).
+        """
+        return OSA.distance(str1, str2)
+
+    @classmethod
+    def levenshtein_distance(cls, str1, str2) -> int:
+        """
+        Returns the edit distance between the two strings
+        The standard Levenshtein distance allows insertion, deletion, and substitution, but not transposition of adjacent characters
+
+        jellyfish.levenshtein_distance(str1,str2) gives the same result as this method
+
+        print(Jaro.distance(str1, str2))
+        consistent with print(Jaro.normalized_distance(str1, str2))
+
+        print(Jaro.similarity(str1, str2))
+        consistent with print(Jaro.normalized_similarity(str1,str2))
+        """
+        # edit-distance length
+        distance = Levenshtein.distance(str1, str2)
+        print(jellyfish.levenshtein_distance(str1, str2))
+        return distance
+
+    @classmethod
+    def indel_distance(cls, str1, str2) -> int:
+        """
+        The Indel (Insertion + Deletion) distance is an edit distance that counts only insertions and deletions, not substitutions.
+        """
+        return Indel.distance(str1, str2)
+
+    @classmethod
+    def damerau_levenshtein_distance(cls, str1, str2) -> int:
+        """
+        The Damerau-Levenshtein distance is a modification of the Levenshtein distance that counts a transposition (e.g. ifsh for fish) as a single edit
+        """
+        # edit-distance length
+        distance = jellyfish.damerau_levenshtein_distance(str1, str2)
+        print(DamerauLevenshtein.distance(str1, str2))
+        return distance
+
+    @classmethod
+    def hamming_distance(cls, str1, str2) -> int:
+        return Hamming.distance(str1, str2)
+
+# str1 = "primulina elegant ladyis a new culitvar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
+# str2 = "primulinaelegant labyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
+# # str1 = "primulina elegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
+# # str2 = "primulinaelegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
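A short usage sketch of the new class (the sample strings are arbitrary); all methods are classmethods, so no instance is needed. It requires the Levenshtein, jellyfish, and rapidfuzz packages imported at the top of the new module:

from re_common.v2.baselibrary.utils.base_string_similarity import BaseStringSimilarity

s1, s2 = "Druitt, Robert", "Druitt R."
print(BaseStringSimilarity.levenshtein_similarity(s1, s2))   # 1 - distance / max(len)
print(BaseStringSimilarity.jaro_winkler_similarity(s1, s2))  # prefix-weighted Jaro score
print(BaseStringSimilarity.lcs_seq_distance(s1, s2))         # max(len1, len2) - LCS length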
re_common/v2/baselibrary/utils/string_clear.py

@@ -6,6 +6,11 @@ import regex
 from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
     remove_spaces_between_chinese_characters
 
+from opencc import OpenCC
+
+# pip install opencc-python-reimplemented
+cc = OpenCC("t2s")  # t2s converts Traditional Chinese to Simplified
+
 
 class StringClear(object):
 
@@ -31,6 +36,11 @@ class StringClear(object):
         self.obj_str = bj2qj(self.obj_str)
         return self
 
+    def convert_to_simplified(self):
+        # Traditional-to-Simplified conversion
+        self.obj_str = cc.convert(self.obj_str)
+        return self
+
     def lower(self):
         self.obj_str = self.obj_str.lower()
         return self
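The conversion itself comes straight from OpenCC, so a minimal standalone sketch looks like this (the sample string is arbitrary):

from opencc import OpenCC  # pip install opencc-python-reimplemented

cc = OpenCC("t2s")  # Traditional -> Simplified
print(cc.convert("漢語轉換"))  # -> 汉语转换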
{re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/METADATA

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: re_common
-Version: 10.0.
+Version: 10.0.18
 Summary: a library about all python projects
 Home-page: https://gitee.com/xujiangios/re-common
 Author: vic
@@ -11,6 +11,14 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-python
+Dynamic: summary
 
 
 This is a base library that depends on many third-party packages; it wraps the commonly used third-party libraries so that projects can be built on top of it quickly
{re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/RECORD

@@ -173,14 +173,15 @@ re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
 re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
 re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
-re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=
-re_common/v2/baselibrary/tools/list_tools.py,sha256=
+re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=CoavcREurIAYlnVXWX8gmuE0jCobytsZjsf3OJi581A,14793
+re_common/v2/baselibrary/tools/list_tools.py,sha256=mZyrOGdW6tuany0lKQOD4P739xikvmeKm1VSzo37Byc,1973
 re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
 re_common/v2/baselibrary/tools/text_matcher.py,sha256=cPMoFxaA0-ce3tLRxVSs8_3pTYS1oVIHDnNy_AlPU-4,10756
 re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
-re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=
+re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
 re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
+re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
 re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
 re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
 re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
@@ -189,7 +190,7 @@ re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtb
 re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
 re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
 re_common/v2/baselibrary/utils/string_bool.py,sha256=S5HMemxl2S248p4sEakD1NFJccH1NMQcbOFGmSFfcbg,4695
-re_common/v2/baselibrary/utils/string_clear.py,sha256=
+re_common/v2/baselibrary/utils/string_clear.py,sha256=23UB2YAcSJetGmqFzpHBwaigm0e65I6BvIsJs8WgvCw,6701
 re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
 re_common/v2/baselibrary/utils/stringutils.py,sha256=WuxhXJVU6xuGfgHiSjxrn7Go1eobpa8DMR3Icoey4vo,6039
 re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -218,8 +219,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
 re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
 re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
 re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
-re_common-10.0.
+re_common-10.0.18.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+re_common-10.0.18.dist-info/METADATA,sha256=QyJZdyBXGoDoLqMDXfnRiMHBEaLlop35pjoGq9PbZyE,764
+re_common-10.0.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+re_common-10.0.18.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
+re_common-10.0.18.dist-info/RECORD,,

{re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/LICENSE
File without changes

{re_common-10.0.16.dist-info → re_common-10.0.18.dist-info}/top_level.txt
File without changes