re-common 10.0.11__py3-none-any.whl → 10.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/tools/list_tools.py +9 -0
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +59 -0
- re_common/v2/baselibrary/utils/basedict.py +13 -1
- re_common/v2/baselibrary/utils/string_smi.py +18 -0
- re_common/v2/baselibrary/utils/stringutils.py +58 -0
- {re_common-10.0.11.dist-info → re_common-10.0.12.dist-info}/METADATA +1 -1
- {re_common-10.0.11.dist-info → re_common-10.0.12.dist-info}/RECORD +10 -9
- {re_common-10.0.11.dist-info → re_common-10.0.12.dist-info}/LICENSE +0 -0
- {re_common-10.0.11.dist-info → re_common-10.0.12.dist-info}/WHEEL +0 -0
- {re_common-10.0.11.dist-info → re_common-10.0.12.dist-info}/top_level.txt +0 -0
|
@@ -114,3 +114,62 @@ def get_wos_author_abbr(author_row: str):
|
|
|
114
114
|
abbr_list = [author.strip() for author in abbr_list if
|
|
115
115
|
author.strip() and author.strip().lower() not in ("*", "and")]
|
|
116
116
|
return ";".join(abbr_list)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def deal_rel_vol(vol_str: str):
    """
    Normalize a journal volume string (used when merging journal records).

    Rules, applied in order:

    1. A string made only of digits and dots that parses as a whole number
       is returned as a plain integer string ("12.0" -> "12", "007" -> "7").
    2. A leading "v ", "volume ", "vol. " or "vol " marker (matched
       case-insensitively) is removed; the remainder is returned lower-cased
       and stripped.
    3. Anything else is returned unchanged.

    :param vol_str: raw volume text
    :return: normalized volume text
    """
    if vol_str.replace(".", "").isdigit():
        try:
            float_num = float(vol_str)
            if int(float_num) == float_num:
                return str(int(float_num))
        except (ValueError, OverflowError):
            # ValueError: several dots or non-ASCII "digits" such as "1.2.3";
            # OverflowError: digit strings so long that float() yields inf.
            # Either way, fall through to the prefix handling below.
            pass

    lowered = vol_str.lower()
    for prefix in ("v ", "volume ", "vol. ", "vol "):
        if lowered.startswith(prefix):
            # Drop only the leading marker. str.replace() would also delete
            # later occurrences, e.g. the "v " inside "v 1 rev 2".
            return lowered[len(prefix):].strip()
    return vol_str
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def deal_num_strs(input_str):
    """
    Rewrite every run of digits in the string as ``str(int(run))``.

    This removes leading zeros ("S01" -> "S1") so that differently padded
    spellings of the same number compare equal.

    :param input_str: text that may contain numbers
    :return: the text with each digit run normalized in place
    """
    # Substitute match by match. A global str.replace() per number would
    # also hit the same digits inside other numbers
    # ("01 and 101" would become "1 and 11").
    return re.sub(r'\d+', lambda match: str(int(match.group())), input_str)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def deal_num(strs):
    """
    Normalize an issue string so that equivalent spellings group together,
    in particular values containing a hyphen.

    "-" and "." are both turned into "_", the text is upper-cased, and each
    "_"-separated part is passed through ``deal_num_strs``
    (e.g. "1-02" -> "1_2", "s01" -> "S1").

    Written for the secondary split step of record merging, but usable
    elsewhere when the same normalization is wanted.

    :param strs: raw issue text
    :return: normalized issue text
    """
    strs = strs.replace("-", "_").replace(".", "_").upper()
    # Normalize every part. Unpacking into exactly (start, end) raised
    # ValueError for inputs with more than one separator, such as "1-2-3".
    return "_".join(deal_num_strs(part) for part in strs.split("_"))
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from itertools import groupby
|
|
2
3
|
|
|
3
4
|
logger = logging.getLogger(__name__) # 创建 logger 实例
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class BaseDict(object):
|
|
7
8
|
@classmethod
|
|
8
|
-
def flip_dict(cls, original_dict, raise_on_conflict=
|
|
9
|
+
def flip_dict(cls, original_dict, raise_on_conflict=True):
|
|
9
10
|
"""
|
|
10
11
|
翻转字典:将 key 是字符串、value 是列表的字典,转换为 key 是原 value 列表中的元素、value 是原 key 的字典。
|
|
11
12
|
:param original_dict: 原始字典
|
|
@@ -24,3 +25,14 @@ class BaseDict(object):
|
|
|
24
25
|
f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
|
|
25
26
|
flipped_dict[value] = key
|
|
26
27
|
return flipped_dict
|
|
28
|
+
|
|
29
|
+
@classmethod
|
|
30
|
+
def get_temp_gid_dicts(cls, lists, key_name):
    """
    Group a list of dicts by one of their keys.

    :param lists: iterable of dicts (or other objects indexable by
        ``key_name``); the values under ``key_name`` must be sortable
    :param key_name: key whose value identifies the group
    :return: dict mapping each group id to the list of its items; group ids
        appear in sorted order and items keep their relative input order
    """

    def group_key(item):
        return item[key_name]

    dicts = {}
    # groupby only merges adjacent items, hence the sort on the same key.
    for group_id, group_tmp in groupby(sorted(lists, key=group_key), key=group_key):
        # Materialize the group now: groupby's sub-iterators share the
        # underlying iterator and are empty once groupby moves on, so
        # storing them directly would return a dict of exhausted iterators.
        dicts[group_id] = list(group_tmp)
    return dicts
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import jellyfish
|
|
2
|
+
from rapidfuzz.distance import DamerauLevenshtein
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JaroDamerauLevenshteinMaxSim(object):
    """
    Similarity scorer that returns the better of two measures: Jaro
    similarity and (1 - normalized Damerau-Levenshtein distance).

    Jaro similarity alone has a weakness; sample data such as the pair
    below gets a very low score:
    s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
    s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
    """

    def get_sim(self, str1: str, str2: str) -> float:
        """
        Return the larger of the Jaro score and the edit-distance score.

        :param str1: first text
        :param str2: second text
        :return: max of the two similarity scores
        """
        jaro_score = jellyfish.jaro_similarity(str1, str2)
        both_blank = not str1.strip() and not str2.strip()
        # Two blank strings get no credit from the edit-distance measure.
        edit_score = 0 if both_blank else 1 - DamerauLevenshtein.normalized_distance(str1, str2)
        return max(jaro_score, edit_score)
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import threading
|
|
3
|
+
from itertools import combinations
|
|
3
4
|
|
|
4
5
|
import regex
|
|
5
6
|
import unicodedata
|
|
6
7
|
from html.parser import HTMLParser
|
|
7
8
|
|
|
9
|
+
from re_common.v2.baselibrary.utils.string_clear import rel_clear
|
|
10
|
+
from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
def bj2qj(src):
|
|
10
14
|
if src is None:
|
|
@@ -152,3 +156,57 @@ def remove_spaces_between_chinese_characters(text):
|
|
|
152
156
|
"""
|
|
153
157
|
pattern = r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])'
|
|
154
158
|
return re.sub(pattern, '', text)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Module-level similarity scorer instance, used by group_similar_texts.
sim_utils = JaroDamerauLevenshteinMaxSim()
|
|
163
|
+
|
|
164
|
+
def group_similar_texts(texts, threshold=0.9):
    """
    Group texts by similarity using connected components.

    Every pair of texts is scored with ``sim_utils.get_sim`` after cleaning
    both with ``rel_clear``; pairs scoring at least ``threshold`` are linked,
    and each connected component of the resulting graph forms one group.

    :param texts: sequence of texts
    :param threshold: minimum similarity for two texts to be linked
    :return: list of groups, each a list of indices into ``texts``
    """
    n = len(texts)
    # Clean each text once instead of once per pair. Skipped when there are
    # no pairs to compare, so a lone text is never passed to rel_clear.
    # NOTE(review): assumes rel_clear depends only on its argument - confirm.
    cleaned = [rel_clear(text) for text in texts] if n > 1 else []

    # Adjacency list of the similarity graph.
    graph = [[] for _ in range(n)]
    for i, j in combinations(range(n), 2):
        if sim_utils.get_sim(cleaned[i], cleaned[j]) >= threshold:
            graph[i].append(j)
            graph[j].append(i)

    visited = [False] * n
    groups = []

    # Iterative depth-first search: a recursive walk can exceed Python's
    # recursion limit on a large connected component.
    for start in range(n):
        if visited[start]:
            continue
        group = []
        stack = [start]
        while stack:
            node = stack.pop()
            if visited[node]:
                continue
            visited[node] = True
            group.append(node)
            # Push neighbours in reverse so they are popped in adjacency
            # order, giving the same visiting order as a recursive DFS.
            stack.extend(neighbor for neighbor in reversed(graph[node])
                         if not visited[neighbor])
        groups.append(group)

    return groups
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def get_group_abstract(lists):
    """
    Grouping routine: partition records by text similarity using a simple
    connected-graph grouping.

    :param lists: [(id, txt), ...]
    :return: two-dimensional list; each inner list holds the ids of one group
    """
    texts = [item[1] for item in lists]
    ids = [item[0] for item in lists]
    index_groups = group_similar_texts(texts, threshold=0.9)
    # Map each group of text positions back to the record ids.
    return [[ids[position] for position in members] for members in index_groups]
|
|
@@ -170,20 +170,21 @@ re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5a
|
|
|
170
170
|
re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
171
|
re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
|
|
172
172
|
re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
|
|
173
|
-
re_common/v2/baselibrary/tools/list_tools.py,sha256=
|
|
173
|
+
re_common/v2/baselibrary/tools/list_tools.py,sha256=qYxdLccRbrULOBbaPdJ_MyFFmVJGVMdW5E36nJ3ejr8,249
|
|
174
174
|
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
|
|
175
175
|
re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
|
|
176
176
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
177
|
-
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=
|
|
177
|
+
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=__9MECbdrMnYc-ksYn2liM8vEbqF9uR4hZKqw86kW1Q,5924
|
|
178
178
|
re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
179
|
re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
|
|
180
|
-
re_common/v2/baselibrary/utils/basedict.py,sha256=
|
|
180
|
+
re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
|
|
181
181
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
182
182
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
183
183
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
184
184
|
re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
|
|
185
185
|
re_common/v2/baselibrary/utils/string_clear.py,sha256=pGxL9PlzQDM06sC0j6U0zYRemvsJ7-OOpfzS5ETCxAs,6258
|
|
186
|
-
re_common/v2/baselibrary/utils/
|
|
186
|
+
re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
|
|
187
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=ISheMydPZeNmqsffGDT4Ut_UGpK3r6k8STR78Ere8Wg,6033
|
|
187
188
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
188
189
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
189
190
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -210,8 +211,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
210
211
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
211
212
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
212
213
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
213
|
-
re_common-10.0.
|
|
214
|
-
re_common-10.0.
|
|
215
|
-
re_common-10.0.
|
|
216
|
-
re_common-10.0.
|
|
217
|
-
re_common-10.0.
|
|
214
|
+
re_common-10.0.12.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
215
|
+
re_common-10.0.12.dist-info/METADATA,sha256=N9MX7TnI7lhFQyhFaV0n0wr5XWT5prKsFX9gsM-X4T4,582
|
|
216
|
+
re_common-10.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
217
|
+
re_common-10.0.12.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
218
|
+
re_common-10.0.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|