re-common 10.0.0__py3-none-any.whl → 10.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/utils/author_smi.py +14 -3
- re_common/v2/baselibrary/utils/stringutils.py +1 -0
- {re_common-10.0.0.dist-info → re_common-10.0.1.dist-info}/METADATA +1 -1
- {re_common-10.0.0.dist-info → re_common-10.0.1.dist-info}/RECORD +7 -7
- {re_common-10.0.0.dist-info → re_common-10.0.1.dist-info}/LICENSE +0 -0
- {re_common-10.0.0.dist-info → re_common-10.0.1.dist-info}/WHEEL +0 -0
- {re_common-10.0.0.dist-info → re_common-10.0.1.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import copy
|
|
1
2
|
import re
|
|
2
3
|
import string
|
|
3
4
|
|
|
@@ -248,6 +249,16 @@ def AuthorRatio(
|
|
|
248
249
|
if len(l1) == len(l2) and (is_same_or_initials_match(l1, l2) or set(l1) == set(l2)):
|
|
249
250
|
return 1
|
|
250
251
|
|
|
252
|
+
# 在这里针对上面一条算法再增加一条算法:先对 list 排序,再对它进行上面的对比
|
|
253
|
+
# 如果长度相等 简写也是单词的首字母 那么两个名字一致 举例:Guo, Qiang @@ Q. Guo
|
|
254
|
+
sort_l1 = copy.deepcopy(l1)
|
|
255
|
+
sort_l2 = copy.deepcopy(l2)
|
|
256
|
+
sort_l1.sort()
|
|
257
|
+
sort_l2.sort()
|
|
258
|
+
if len(sort_l1) == len(sort_l2) and (is_same_or_initials_match(sort_l1, sort_l2) or set(sort_l1) == set(sort_l2)):
|
|
259
|
+
return 0.99
|
|
260
|
+
|
|
261
|
+
|
|
251
262
|
##############################################################
|
|
252
263
|
# 以上为情况穷举情况,以下为其他情况的相似率计算
|
|
253
264
|
##############################################################
|
|
@@ -262,7 +273,7 @@ def AuthorRatio(
|
|
|
262
273
|
len_ratio = len1 / len2 if len1 > len2 else len2 / len1
|
|
263
274
|
|
|
264
275
|
# 计算归一化的 Jaro 相似度。 对于比率<score_cutoff,返回0。
|
|
265
|
-
end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1, s2)
|
|
276
|
+
end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1.lower(), s2.lower())
|
|
266
277
|
|
|
267
278
|
# 需要对作者的比率分布进行调研决定哪些是小比率哪些是大比率
|
|
268
279
|
if len_ratio > 1.5 and len_ratio < 3:
|
|
@@ -287,7 +298,7 @@ def AuthorRatio(
|
|
|
287
298
|
|
|
288
299
|
# 首字母相同提分
|
|
289
300
|
# if is_contained(extract_initials(s1), extract_initials(s2)):
|
|
290
|
-
if is_contained_list([i[:1] for i in l1], [i[:1] for i in l2]):
|
|
301
|
+
if is_contained_list([i[:1].lower() for i in l1], [i[:1].lower() for i in l2]):
|
|
291
302
|
# 应该提分
|
|
292
303
|
end_ratio = end_ratio * 1.05
|
|
293
304
|
else:
|
|
@@ -302,7 +313,7 @@ def AuthorRatio(
|
|
|
302
313
|
end_ratio = end_ratio * 1.1
|
|
303
314
|
|
|
304
315
|
if l1[0] != l2[0]:
|
|
305
|
-
end_ratio = end_ratio * Jaro.normalized_similarity(l1[0], l2[0])
|
|
316
|
+
end_ratio = end_ratio * Jaro.normalized_similarity(l1[0].lower(), l2[0].lower())
|
|
306
317
|
|
|
307
318
|
# 如果字符串本身的相似度高 应该拉上去 否则应该拉下来
|
|
308
319
|
return min(end_ratio, 1) * 0.5 + normal_end_ratio * 0.5
|
|
@@ -173,13 +173,13 @@ re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQ
|
|
|
173
173
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
174
174
|
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=dxrWO800wElZM_4aKolUHSPBYZlxqzXukE4M-LZ13jA,2644
|
|
175
175
|
re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
176
|
-
re_common/v2/baselibrary/utils/author_smi.py,sha256=
|
|
176
|
+
re_common/v2/baselibrary/utils/author_smi.py,sha256=wkuoGEBNM28k8D1E83vBxJD5N4xgzr6aAQFMVPJ2tnc,11585
|
|
177
177
|
re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
|
|
178
178
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
179
179
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
180
180
|
re_common/v2/baselibrary/utils/string_bool.py,sha256=f5qYdKvTufxmfSsxXN41WFLV--vCwDWU2LeQPbDvKZY,178
|
|
181
181
|
re_common/v2/baselibrary/utils/string_clear.py,sha256=LqGvv-UZnsVwiDBN3-PdzDUTfWlAsKsvKlkXqySI0eE,3244
|
|
182
|
-
re_common/v2/baselibrary/utils/stringutils.py,sha256=
|
|
182
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=quAgCdW_ayQwY4AqnZZkZ4NlcSEcy6f1arOVSeP2vEo,2699
|
|
183
183
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
184
184
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
185
185
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -206,8 +206,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
206
206
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
207
207
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
208
208
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
209
|
-
re_common-10.0.
|
|
210
|
-
re_common-10.0.
|
|
211
|
-
re_common-10.0.
|
|
212
|
-
re_common-10.0.
|
|
213
|
-
re_common-10.0.
|
|
209
|
+
re_common-10.0.1.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
210
|
+
re_common-10.0.1.dist-info/METADATA,sha256=xIF1hPdvDgN_bQ3YpyAG3_tjxGOIVQvNUM5NraOe73o,581
|
|
211
|
+
re_common-10.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
212
|
+
re_common-10.0.1.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
213
|
+
re_common-10.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|