re-common 10.0.10__py3-none-any.whl → 10.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/tools/list_tools.py +9 -0
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +59 -0
- re_common/v2/baselibrary/utils/basedict.py +13 -1
- re_common/v2/baselibrary/utils/string_clear.py +9 -1
- re_common/v2/baselibrary/utils/string_smi.py +18 -0
- re_common/v2/baselibrary/utils/stringutils.py +66 -0
- {re_common-10.0.10.dist-info → re_common-10.0.12.dist-info}/METADATA +1 -1
- {re_common-10.0.10.dist-info → re_common-10.0.12.dist-info}/RECORD +11 -10
- {re_common-10.0.10.dist-info → re_common-10.0.12.dist-info}/LICENSE +0 -0
- {re_common-10.0.10.dist-info → re_common-10.0.12.dist-info}/WHEEL +0 -0
- {re_common-10.0.10.dist-info → re_common-10.0.12.dist-info}/top_level.txt +0 -0
|
@@ -114,3 +114,62 @@ def get_wos_author_abbr(author_row: str):
|
|
|
114
114
|
abbr_list = [author.strip() for author in abbr_list if
|
|
115
115
|
author.strip() and author.strip().lower() not in ("*", "and")]
|
|
116
116
|
return ";".join(abbr_list)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def deal_rel_vol(vol_str: str):
    """
    Normalize a volume string for journal-record merging.

    Purely numeric values rendered float-style collapse to integer form
    ("5.0" -> "5"); otherwise a leading volume marker ("v ", "volume ",
    "vol. ", "vol ") is stripped and the remainder returned lower-cased;
    anything else is returned unchanged.

    :param vol_str: raw volume string
    :return: normalized volume string
    """
    if vol_str.replace(".", "").isdigit():
        try:
            float_num = float(vol_str)
            if int(float_num) == float_num:
                return str(int(float_num))
        # ValueError: e.g. "1.2.3" passes the isdigit() gate but not float();
        # OverflowError: int(float(huge digit string)) when float() yields inf.
        except (ValueError, OverflowError):
            pass

    lowered = vol_str.lower()
    # Strip only the *leading* marker. The original used str.replace, which
    # also removed any later occurrences of the marker inside the string.
    for prefix in ("v ", "volume ", "vol. ", "vol "):
        if lowered.startswith(prefix):
            return lowered[len(prefix):].strip()
    return vol_str
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def deal_num_strs(input_str):
    """
    Normalize every digit run in *input_str* via an int() round-trip
    (strips leading zeros; guards against float-style renderings).

    The original implementation collected numbers with re.findall and
    substituted them back with str.replace, which could corrupt unrelated
    digit runs elsewhere in the string (e.g. "00 100" became "0 10").
    re.sub with a callable replacement rewrites each match in place.

    :param input_str: string possibly containing digit runs
    :return: the string with each digit run normalized
    """
    return re.sub(r'\d+', lambda m: str(int(m.group(0))), input_str)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def deal_num(strs):
    """
    Format an issue string for grouping, especially hyphenated ranges.

    Used during the second-pass split of record merging; may be reused
    elsewhere when the scenario fits. "-" and "." are folded to "_",
    the string is upper-cased, and every "_"-separated segment has its
    numbers normalized individually via deal_num_strs.

    The original unpacked strs.split("_") into exactly two names, which
    raised ValueError whenever more than one separator was present
    (e.g. "1-2-3" or "1.2-3"); joining over all segments handles any count
    while producing identical output for zero or one separator.

    :param strs: raw issue string
    :return: normalized, "_"-joined, upper-cased issue string
    """
    strs = strs.replace("-", "_").replace(".", "_").upper()
    return "_".join(deal_num_strs(part) for part in strs.split("_"))
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from itertools import groupby
|
|
2
3
|
|
|
3
4
|
logger = logging.getLogger(__name__) # 创建 logger 实例
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class BaseDict(object):
|
|
7
8
|
@classmethod
|
|
8
|
-
def flip_dict(cls, original_dict, raise_on_conflict=
|
|
9
|
+
def flip_dict(cls, original_dict, raise_on_conflict=True):
|
|
9
10
|
"""
|
|
10
11
|
翻转字典:将 key 是字符串、value 是列表的字典,转换为 key 是原 value 列表中的元素、value 是原 key 的字典。
|
|
11
12
|
:param original_dict: 原始字典
|
|
@@ -24,3 +25,14 @@ class BaseDict(object):
|
|
|
24
25
|
f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
|
|
25
26
|
flipped_dict[value] = key
|
|
26
27
|
return flipped_dict
|
|
28
|
+
|
|
29
|
+
@classmethod
def get_temp_gid_dicts(cls, lists, key_name):
    """
    Group a list of dicts by row[key_name] and return {group_id: [rows]}.

    itertools.groupby grouper iterators are invalidated as soon as the
    outer iteration advances, so each group must be materialized into a
    list before being stored — the original stored the raw iterators,
    leaving all but the last group empty/unusable by the caller.

    :param lists: iterable of dict-like rows
    :param key_name: key to group by
    :return: dict mapping each group id to the list of its rows
    """
    def _key(row):
        return row[key_name]

    dicts = {}
    # groupby requires the input sorted by the same key.
    for group_id, group_rows in groupby(sorted(lists, key=_key), key=_key):
        dicts[group_id] = list(group_rows)
    return dicts
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import regex
|
|
3
3
|
|
|
4
|
-
from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html
|
|
4
|
+
from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
|
|
5
|
+
remove_spaces_between_chinese_characters
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class StringClear(object):
|
|
@@ -101,6 +102,7 @@ class StringClear(object):
|
|
|
101
102
|
return self
|
|
102
103
|
|
|
103
104
|
def remove_html_tag(self):
|
|
105
|
+
# 去除 html 标签
|
|
104
106
|
import html
|
|
105
107
|
|
|
106
108
|
self.obj_str = html.unescape(self.obj_str)
|
|
@@ -109,6 +111,11 @@ class StringClear(object):
|
|
|
109
111
|
|
|
110
112
|
return self
|
|
111
113
|
|
|
114
|
+
def remove_spaces_in_chinese_characters(self):
    """Strip whitespace found between adjacent Chinese characters; fluent."""
    cleaned = remove_spaces_between_chinese_characters(self.obj_str)
    self.obj_str = cleaned
    return self
|
|
118
|
+
|
|
112
119
|
def get_str(self):
|
|
113
120
|
return self.obj_str
|
|
114
121
|
|
|
@@ -122,6 +129,7 @@ def rel_clear(str_obj):
|
|
|
122
129
|
.remove_html_tag() # html标签清理
|
|
123
130
|
.remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
|
|
124
131
|
.collapse_spaces() # 移除多余空格,连续多个空格变一个
|
|
132
|
+
.remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
|
|
125
133
|
.lower() # 小写
|
|
126
134
|
.get_str() # 获取str
|
|
127
135
|
.strip()) # 去掉空格
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import jellyfish
|
|
2
|
+
from rapidfuzz.distance import DamerauLevenshtein
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JaroDamerauLevenshteinMaxSim(object):
    """
    Similarity scorer returning the better of Jaro similarity and
    normalized Damerau-Levenshtein similarity.

    jaro_similarity alone is flawed — the following sample pair scores
    very low under it:
    s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
    s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
    """

    def get_sim(self, str1: str, str2: str) -> float:
        jaro_score = jellyfish.jaro_similarity(str1, str2)
        if not str1.strip() and not str2.strip():
            # Both strings blank: the Damerau-Levenshtein component is
            # pinned to 0 rather than computed.
            dl_score = 0
        else:
            dl_score = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
        return max(jaro_score, dl_score)
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import threading
|
|
3
|
+
from itertools import combinations
|
|
3
4
|
|
|
4
5
|
import regex
|
|
5
6
|
import unicodedata
|
|
6
7
|
from html.parser import HTMLParser
|
|
7
8
|
|
|
9
|
+
from re_common.v2.baselibrary.utils.string_clear import rel_clear
|
|
10
|
+
from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
def bj2qj(src):
|
|
10
14
|
if src is None:
|
|
@@ -143,4 +147,66 @@ def clean_html(html):
|
|
|
143
147
|
return parser.get_text()
|
|
144
148
|
|
|
145
149
|
|
|
150
|
+
def remove_spaces_between_chinese_characters(text):
    """
    Remove whitespace that sits between two CJK characters.

    Only the main CJK range (U+3400–U+9FFF) is matched; the scattered
    extension blocks are deliberately left out — matching them all would
    cost performance, and this range is considered sufficient here.

    :param text: input string
    :return: the string with inter-CJK whitespace removed
    """
    cjk = r'[\u3400-\u9fff]'
    # Lookarounds keep both neighbouring CJK characters in place.
    return re.sub('(?<=' + cjk + r')\s+(?=' + cjk + ')', '', text)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
sim_utils = JaroDamerauLevenshteinMaxSim()
|
|
163
|
+
|
|
164
|
+
def group_similar_texts(texts, threshold=0.9):
    """
    Group texts into connected components of pairwise similarity.

    Texts i and j are linked when the similarity of their cleaned forms is
    >= threshold; DFS over the resulting graph yields the groups.

    :param texts: list of raw text strings
    :param threshold: minimum similarity to link two texts
    :return: list of groups, each a list of indices into *texts*
    """
    n = len(texts)
    # Clean each text exactly once up front. The original called
    # rel_clear() inside the pair loop, redoing the same cleaning work
    # O(n) times per text across the O(n^2) pairs.
    cleaned_texts = [rel_clear(t) for t in texts]

    # Adjacency list for the similarity graph.
    graph = [[] for _ in range(n)]
    for i, j in combinations(range(n), 2):
        similarity = sim_utils.get_sim(cleaned_texts[i], cleaned_texts[j])
        if similarity >= threshold:
            graph[i].append(j)
            graph[j].append(i)

    visited = [False] * n
    groups = []

    # DFS collects one connected component into *group*.
    def dfs(node, group):
        visited[node] = True
        group.append(node)
        for neighbor in graph[node]:
            if not visited[neighbor]:
                dfs(neighbor, group)

    # Emit every connected component.
    for i in range(n):
        if not visited[i]:
            current_group = []
            dfs(i, current_group)
            groups.append(current_group)

    return groups
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def get_group_abstract(lists, threshold=0.9):
    """
    Grouping routine: groups records via a simple connected-component graph.

    :param lists: [(id, txt), ...]
    :param threshold: similarity threshold forwarded to group_similar_texts;
        defaults to 0.9, matching the previously hard-coded value.
    :return: a 2-D list — each inner list holds the ids of one group
    """
    abstract_list = [item[1] for item in lists]
    keyid_list = [item[0] for item in lists]
    groups = group_similar_texts(abstract_list, threshold=threshold)
    # Map each group's text indices back to the caller-supplied ids.
    return [[keyid_list[idx] for idx in group] for group in groups]
|
|
@@ -170,20 +170,21 @@ re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5a
|
|
|
170
170
|
re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
171
|
re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
|
|
172
172
|
re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
|
|
173
|
-
re_common/v2/baselibrary/tools/list_tools.py,sha256=
|
|
173
|
+
re_common/v2/baselibrary/tools/list_tools.py,sha256=qYxdLccRbrULOBbaPdJ_MyFFmVJGVMdW5E36nJ3ejr8,249
|
|
174
174
|
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
|
|
175
175
|
re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
|
|
176
176
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
177
|
-
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=
|
|
177
|
+
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=__9MECbdrMnYc-ksYn2liM8vEbqF9uR4hZKqw86kW1Q,5924
|
|
178
178
|
re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
179
|
re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
|
|
180
|
-
re_common/v2/baselibrary/utils/basedict.py,sha256=
|
|
180
|
+
re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
|
|
181
181
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
182
182
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
183
183
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
184
184
|
re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
|
|
185
|
-
re_common/v2/baselibrary/utils/string_clear.py,sha256=
|
|
186
|
-
re_common/v2/baselibrary/utils/
|
|
185
|
+
re_common/v2/baselibrary/utils/string_clear.py,sha256=pGxL9PlzQDM06sC0j6U0zYRemvsJ7-OOpfzS5ETCxAs,6258
|
|
186
|
+
re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
|
|
187
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=ISheMydPZeNmqsffGDT4Ut_UGpK3r6k8STR78Ere8Wg,6033
|
|
187
188
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
188
189
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
189
190
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -210,8 +211,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
210
211
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
211
212
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
212
213
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
213
|
-
re_common-10.0.
|
|
214
|
-
re_common-10.0.
|
|
215
|
-
re_common-10.0.
|
|
216
|
-
re_common-10.0.
|
|
217
|
-
re_common-10.0.
|
|
214
|
+
re_common-10.0.12.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
215
|
+
re_common-10.0.12.dist-info/METADATA,sha256=N9MX7TnI7lhFQyhFaV0n0wr5XWT5prKsFX9gsM-X4T4,582
|
|
216
|
+
re_common-10.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
217
|
+
re_common-10.0.12.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
218
|
+
re_common-10.0.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|