re-common 10.0.0__py3-none-any.whl → 10.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +45 -3
- re_common/v2/baselibrary/utils/author_smi.py +23 -10
- re_common/v2/baselibrary/utils/string_bool.py +17 -0
- re_common/v2/baselibrary/utils/string_clear.py +28 -1
- re_common/v2/baselibrary/utils/stringutils.py +1 -0
- {re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/METADATA +1 -1
- {re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/RECORD +10 -10
- {re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/LICENSE +0 -0
- {re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/WHEEL +0 -0
- {re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# 某些业务中的字符串处理 算是特定场景的工具
|
|
1
|
+
# 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
|
|
2
2
|
import re
|
|
3
3
|
|
|
4
4
|
|
|
@@ -56,7 +56,7 @@ def get_first_organ(organ):
|
|
|
56
56
|
for organ_one in organ_list:
|
|
57
57
|
# 清理邮政编码
|
|
58
58
|
organ_one = clean_organ_postcode(organ_one)
|
|
59
|
-
if organ_one:
|
|
59
|
+
if organ_one.strip():
|
|
60
60
|
return organ_one
|
|
61
61
|
|
|
62
62
|
return ""
|
|
@@ -69,6 +69,48 @@ def get_first_author(author: str) -> str:
|
|
|
69
69
|
for au in au_list:
|
|
70
70
|
au = re.sub("\\[.*?]", "", au)
|
|
71
71
|
au = re.sub("\\(.*?\\)", "", au)
|
|
72
|
-
if au:
|
|
72
|
+
if au.strip():
|
|
73
73
|
return au
|
|
74
74
|
return ""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_author_list(author: str) -> list:
    """Split a ';'-separated author string into a list of cleaned names.

    Each entry has every ``[...]`` and ``(...)`` group removed and is then
    stripped of surrounding whitespace. Entries that end up empty are dropped.

    Args:
        author: Raw author string with names separated by ``;``.

    Returns:
        The cleaned names in their original order, or an empty list when
        ``author`` is falsy (``None`` or ``""``).
    """
    if not author:
        return []

    names = []
    for raw_name in author.strip().split(";"):
        # Remove bracketed and parenthesised groups before trimming, so an
        # entry that consisted only of such a group is discarded below.
        cleaned = re.sub(r"\[.*?]", "", raw_name)
        cleaned = re.sub(r"\(.*?\)", "", cleaned).strip()
        if cleaned:
            names.append(cleaned)
    return names
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_scopus_author_abbr(author_row: str) -> str:
    """Extract the abbreviated author names from a '&&'-joined author row.

    The row must contain exactly three ``&&``-separated fields. Only the
    first field is used: it is split on ``;``, each entry is trimmed, and
    empty entries plus the placeholders ``*`` and ``and`` (case-insensitive)
    are dropped.

    Args:
        author_row: Raw row of the form ``field0&&field1&&field2``.

    Returns:
        The remaining names joined with ``;``, or ``""`` when ``author_row``
        is falsy.

    Raises:
        ValueError: If the row does not contain exactly three fields. This is
            a subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """
    if not author_row:
        return ""

    fields = author_row.split("&&")
    if len(fields) != 3:
        raise ValueError("错误的数据个数 可能来自其他数据源")

    abbr_names = []
    for item in fields[0].split(";"):
        name = item.strip()
        if name and name.lower() not in ("*", "and"):
            abbr_names.append(name)
    return ";".join(abbr_names)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_wos_author_abbr(author_row: str) -> str:
    """Extract the abbreviated author names from a '&&'-joined author row.

    The row must contain exactly four ``&&``-separated fields. Names are
    taken from the first field and then from the third field (presumably the
    WoS AU and BA columns -- TODO confirm against the caller). Each field is
    split on ``;``, entries are trimmed, and empty entries plus the
    placeholders ``*`` and ``and`` (case-insensitive) are dropped.

    Args:
        author_row: Raw row of the form ``f0&&f1&&f2&&f3``.

    Returns:
        The remaining names joined with ``;`` (first-field names before
        third-field names), or ``""`` when ``author_row`` is falsy.

    Raises:
        ValueError: If the row does not contain exactly four fields. This is
            a subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """
    if not author_row:
        return ""

    fields = author_row.split("&&")
    if len(fields) != 4:
        raise ValueError("错误的数据个数 可能来自其他数据源")

    abbr_names = []
    # Only fields 0 and 2 carry abbreviated names; keep that order.
    for field in (fields[0], fields[2]):
        for item in field.split(";"):
            name = item.strip()
            if name and name.lower() not in ("*", "and"):
                abbr_names.append(name)
    return ";".join(abbr_names)
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import copy
|
|
1
2
|
import re
|
|
2
3
|
import string
|
|
3
4
|
|
|
@@ -126,11 +127,6 @@ def custom_rstrip(s):
|
|
|
126
127
|
return s
|
|
127
128
|
|
|
128
129
|
|
|
129
|
-
def deal_str_first(s1):
|
|
130
|
-
# 先对数据处理一波
|
|
131
|
-
s1 = s1.replace("’", "")
|
|
132
|
-
return s1
|
|
133
|
-
|
|
134
130
|
|
|
135
131
|
def AuthorRatio(
|
|
136
132
|
s1,
|
|
@@ -146,7 +142,7 @@ def AuthorRatio(
|
|
|
146
142
|
if is_none(s1) or is_none(s2):
|
|
147
143
|
return 0
|
|
148
144
|
|
|
149
|
-
# 处理字符串的程序
|
|
145
|
+
# 处理字符串的程序 外围传入方法
|
|
150
146
|
if processor is not None:
|
|
151
147
|
s1 = processor(s1)
|
|
152
148
|
s2 = processor(s2)
|
|
@@ -154,12 +150,14 @@ def AuthorRatio(
|
|
|
154
150
|
# 处理后是否为空字符串,如果有 返回0
|
|
155
151
|
if not s1 or not s2:
|
|
156
152
|
return 0
|
|
157
|
-
|
|
153
|
+
# 处理音标问题
|
|
158
154
|
s1 = get_diacritic_variant(s1)
|
|
159
155
|
s2 = get_diacritic_variant(s2)
|
|
160
156
|
# 这里提出来是为了少计算 但后期需要平衡内存和算力
|
|
157
|
+
# 移除指定符号 这里做了小写化处理
|
|
161
158
|
s1_punc = remove_punctuation(s1)
|
|
162
159
|
s2_punc = remove_punctuation(s2)
|
|
160
|
+
# 分成列表
|
|
163
161
|
s1_punc_split = s1_punc.split()
|
|
164
162
|
s2_punc_split = s2_punc.split()
|
|
165
163
|
|
|
@@ -235,6 +233,11 @@ def AuthorRatio(
|
|
|
235
233
|
# 如果循环结束都没有提前返回 False,则表示两个字符串完全匹配,返回 True
|
|
236
234
|
return True
|
|
237
235
|
|
|
236
|
+
# 防止清理后 一方变为空字符串
|
|
237
|
+
if len(l1) == 0 or len(l2) == 0:
|
|
238
|
+
return 0
|
|
239
|
+
|
|
240
|
+
# 这里的逻辑是最后的位置全大写就将他拆分散 比如 joi CJ -> joi C J
|
|
238
241
|
if len(l1[-1]) != 1 and l1[-1].isupper():
|
|
239
242
|
t_str = l1[-1]
|
|
240
243
|
l1 = l1[:-1]
|
|
@@ -248,6 +251,16 @@ def AuthorRatio(
|
|
|
248
251
|
if len(l1) == len(l2) and (is_same_or_initials_match(l1, l2) or set(l1) == set(l2)):
|
|
249
252
|
return 1
|
|
250
253
|
|
|
254
|
+
# 在这里针对上面一条算法再增加一条算法,先对list 排序在对他进行上面的对比
|
|
255
|
+
# 如果长度相等 简写也是单词的首字母 那么两个名字一致 举例:Guo, Qiang @@ Q. Guo
|
|
256
|
+
sort_l1 = copy.deepcopy(l1)
|
|
257
|
+
sort_l2 = copy.deepcopy(l2)
|
|
258
|
+
sort_l1.sort()
|
|
259
|
+
sort_l2.sort()
|
|
260
|
+
if len(sort_l1) == len(sort_l2) and (is_same_or_initials_match(sort_l1, sort_l2) or set(sort_l1) == set(sort_l2)):
|
|
261
|
+
return 0.99
|
|
262
|
+
|
|
263
|
+
|
|
251
264
|
##############################################################
|
|
252
265
|
# 以上为情况穷举情况,以下为其他情况的相似率计算
|
|
253
266
|
##############################################################
|
|
@@ -262,7 +275,7 @@ def AuthorRatio(
|
|
|
262
275
|
len_ratio = len1 / len2 if len1 > len2 else len2 / len1
|
|
263
276
|
|
|
264
277
|
# 计算归一化的 Indel 相似度。 对于比率<score_cutoff,返回0。
|
|
265
|
-
end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1, s2)
|
|
278
|
+
end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1.lower(), s2.lower())
|
|
266
279
|
|
|
267
280
|
# 需要对作者的比率分布进行调研决定哪些是小比率哪些是大比率
|
|
268
281
|
if len_ratio > 1.5 and len_ratio < 3:
|
|
@@ -287,7 +300,7 @@ def AuthorRatio(
|
|
|
287
300
|
|
|
288
301
|
# 首字母相同提分
|
|
289
302
|
# if is_contained(extract_initials(s1), extract_initials(s2)):
|
|
290
|
-
if is_contained_list([i[:1] for i in l1], [i[:1] for i in l2]):
|
|
303
|
+
if is_contained_list([i[:1].lower() for i in l1], [i[:1].lower() for i in l2]):
|
|
291
304
|
# 应该提分
|
|
292
305
|
end_ratio = end_ratio * 1.05
|
|
293
306
|
else:
|
|
@@ -302,7 +315,7 @@ def AuthorRatio(
|
|
|
302
315
|
end_ratio = end_ratio * 1.1
|
|
303
316
|
|
|
304
317
|
if l1[0] != l2[0]:
|
|
305
|
-
end_ratio = end_ratio * Jaro.normalized_similarity(l1[0], l2[0])
|
|
318
|
+
end_ratio = end_ratio * Jaro.normalized_similarity(l1[0].lower(), l2[0].lower())
|
|
306
319
|
|
|
307
320
|
# 如果字符串本身的相似度高 应该拉上去 否者应该拉下来
|
|
308
321
|
return min(end_ratio, 1) * 0.5 + normal_end_ratio * 0.5
|
|
@@ -7,3 +7,20 @@ def is_all_english_chars(s):
|
|
|
7
7
|
|
|
8
8
|
def contains_chinese_chars(s):
|
|
9
9
|
return bool(re.search(r'[\u3400-\u9fff]', s))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_empty(value) -> bool:
    """Return True when ``value`` is None, a blank string, or a sized object of length 0.

    Args:
        value: Any object.

    Returns:
        ``True`` for ``None``, for strings that are empty after ``strip()``,
        and for objects whose ``len()`` is 0. ``False`` for everything else,
        including objects that have no usable length.
    """
    if value is None:
        return True

    # Strings count as empty when only whitespace remains.
    if isinstance(value, str):
        return value.strip() == ""

    # Other sized containers (list, dict, tuple, set, ...). Ask for the length
    # directly: an object can expose ``__len__`` and still be unsized, in
    # which case ``len()`` raises TypeError and the value is not empty.
    try:
        return len(value) == 0
    except TypeError:
        return False
|
|
@@ -66,6 +66,11 @@ class StringClear(object):
|
|
|
66
66
|
self.obj_str = re.sub("[_]", "", self.obj_str)
|
|
67
67
|
return self
|
|
68
68
|
|
|
69
|
+
def replace_dash_with_space(self):
    """Turn every ASCII hyphen in ``self.obj_str`` into a space and return ``self``.

    Replacing the hyphen with a space is sometimes more useful than simply
    removing the symbol.
    """
    pieces = self.obj_str.split("-")
    self.obj_str = " ".join(pieces)
    return self
|
|
73
|
+
|
|
69
74
|
def remove_diacritics(self):
|
|
70
75
|
# 去除音标 转换成字母
|
|
71
76
|
self.obj_str = get_diacritic_variant(self.obj_str)
|
|
@@ -81,6 +86,26 @@ class StringClear(object):
|
|
|
81
86
|
self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
|
|
82
87
|
return self
|
|
83
88
|
|
|
89
|
+
def remove_html_tag(self):
    """Strip HTML markup from ``self.obj_str`` and return ``self``.

    HTML entities are decoded first, then the result is parsed with
    ``parsel.Selector``. ``<script>`` and ``<style>`` elements are removed
    and the remaining text nodes are concatenated.
    """
    # Imported locally, so ``parsel`` is only required when this step runs.
    import html
    from parsel import Selector

    # Decode entities before parsing, so escaped markup such as ``&lt;b&gt;``
    # is also treated as a tag by the parser below.
    self.obj_str = html.unescape(self.obj_str)

    # NOTE: the parameter name shadows the ``html`` module inside this helper.
    def clean_html(html):
        sel = Selector(text=html)
        # Remove scripts and styles
        # NOTE(review): newer parsel releases appear to deprecate ``remove()``
        # in favour of ``drop()`` -- confirm against the pinned version.
        sel.xpath('//script').remove()
        sel.xpath('//style').remove()

        # Extract the text
        text_nodes = sel.xpath('//text()').getall()
        # Each node is stripped and joined with no separator, so text on
        # either side of a tag ends up adjacent.
        # NOTE(review): presumably "a <b>b</b>" becomes "ab" -- confirm that
        # merging words across inline tags is intended.
        return ''.join(t.strip() for t in text_nodes if t.strip())

    self.obj_str = clean_html(self.obj_str)

    return self
|
|
108
|
+
|
|
84
109
|
def get_str(self):
|
|
85
110
|
return self.obj_str
|
|
86
111
|
|
|
@@ -89,8 +114,10 @@ def rel_clear(str_obj):
|
|
|
89
114
|
# 为融合数据定制的 清理规则
|
|
90
115
|
return (StringClear(str_obj)
|
|
91
116
|
.None_to_str() # 空对象转str 防止空对象
|
|
92
|
-
.to_str() # 防止其他类型传入
|
|
117
|
+
.to_str() # 防止其他类型传入 比如 int double
|
|
93
118
|
.qj_to_bj() # 全角转半角
|
|
119
|
+
.remove_html_tag() # html标签清理
|
|
120
|
+
.replace_dash_with_space() # 横线转空格 在 英文 title 中更有用
|
|
94
121
|
.remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
|
|
95
122
|
.collapse_spaces() # 移除多余空格,连续多个空格变一个
|
|
96
123
|
.lower() # 小写
|
|
@@ -171,15 +171,15 @@ re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
|
|
|
171
171
|
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=o-PNjmcYDkfyiR75Jci_9sSn4cGi_F9jPCIrwYdnb1U,1013
|
|
172
172
|
re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
|
|
173
173
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
174
|
-
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=
|
|
174
|
+
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
|
|
175
175
|
re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
176
|
-
re_common/v2/baselibrary/utils/author_smi.py,sha256=
|
|
176
|
+
re_common/v2/baselibrary/utils/author_smi.py,sha256=Mjl0GYH9e0TP48yxnxC7qgMP2bZW04pa8TQezpKo9L0,11796
|
|
177
177
|
re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
|
|
178
178
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
179
179
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
180
|
-
re_common/v2/baselibrary/utils/string_bool.py,sha256=
|
|
181
|
-
re_common/v2/baselibrary/utils/string_clear.py,sha256=
|
|
182
|
-
re_common/v2/baselibrary/utils/stringutils.py,sha256=
|
|
180
|
+
re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
|
|
181
|
+
re_common/v2/baselibrary/utils/string_clear.py,sha256=R3Asus3NcmL-4SVLsfhYmP7YQwB-H7iCCFPnl9eKO7A,4157
|
|
182
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=quAgCdW_ayQwY4AqnZZkZ4NlcSEcy6f1arOVSeP2vEo,2699
|
|
183
183
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
184
184
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
185
185
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -206,8 +206,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
206
206
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
207
207
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
208
208
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
209
|
-
re_common-10.0.
|
|
210
|
-
re_common-10.0.
|
|
211
|
-
re_common-10.0.
|
|
212
|
-
re_common-10.0.
|
|
213
|
-
re_common-10.0.
|
|
209
|
+
re_common-10.0.2.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
210
|
+
re_common-10.0.2.dist-info/METADATA,sha256=oVKxavSnd8Vne03NpymV_GLIR6DXI7UfDBBBj2CgVbc,581
|
|
211
|
+
re_common-10.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
212
|
+
re_common-10.0.2.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
213
|
+
re_common-10.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|