re-common 10.0.11__py3-none-any.whl → 10.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+
2
def check_no_duplicates_2d(lst_2d):
    """Return True when no row of a 2-D list contains a repeated element.

    Each row is compared with the set built from it: if the sizes differ,
    at least one element occurs more than once in that row. Evaluation
    stops at the first row that contains a duplicate.

    :param lst_2d: iterable of rows; each row must support ``len()`` and
        contain hashable elements.
    :return: ``True`` if every row is duplicate-free (also for an empty
        input), ``False`` otherwise.
    """
    return all(len(row) == len(set(row)) for row in lst_2d)
9
+
@@ -114,3 +114,62 @@ def get_wos_author_abbr(author_row: str):
114
114
  abbr_list = [author.strip() for author in abbr_list if
115
115
  author.strip() and author.strip().lower() not in ("*", "and")]
116
116
  return ";".join(abbr_list)
117
+
118
+
119
def deal_rel_vol(vol_str: str):
    """Normalize a volume string (used when merging journal records).

    Two normalizations are applied, in this order:

    1. A string made only of digits and dots that parses to a whole number
       is returned as that integer's string form ("012" -> "12",
       "12.0" -> "12"). Non-integral values such as "1.5" are left alone.
    2. A leading "v ", "volume ", "vol. " or "vol " marker (matched
       case-insensitively) is removed; the remainder is returned
       lower-cased and stripped of surrounding whitespace.

    Anything else is returned unchanged.

    :param vol_str: raw volume text.
    :return: the normalized volume text.
    """
    if vol_str.replace(".", "").isdigit():
        try:
            float_num = float(vol_str)
            if int(float_num) == float_num:
                return str(int(float_num))
        except (ValueError, OverflowError):
            # ValueError: more than one dot (e.g. "1.2.3") or a digit
            # character float() cannot parse.
            # OverflowError: a digit run so long that float() yields inf.
            # Either way the value is not a plain number; fall through.
            pass

    lowered = vol_str.lower()
    for prefix in ("v ", "volume ", "vol. ", "vol "):
        if lowered.startswith(prefix):
            # Slice off only the leading marker. Using str.replace here would
            # also delete the same characters wherever they occur later in
            # the string (e.g. the "v " inside "nov 3").
            return lowered[len(prefix):].strip()
    return vol_str
144
+
145
+
146
def deal_num_strs(input_str):
    """Rewrite every run of digits in a string via ``str(int(...))``.

    This removes leading zeros from each number ("007" -> "7") while leaving
    all non-digit characters where they are.

    Each digit run is rewritten in place in a single regex pass. Replacing
    the numbers one after another with ``str.replace`` is not safe, because
    one number's text can occur inside another number (normalizing "00" in
    "100 00" would also turn "100" into "10").

    :param input_str: text that may contain numbers.
    :return: the text with every digit run normalized.
    """
    return re.sub(r'\d+', lambda match: str(int(match.group())), input_str)
157
+
158
+
159
def deal_num(strs):
    """Normalize an issue string so that equivalent values group together.

    "-" and "." are both treated as separators and rewritten to "_", the
    text is upper-cased, and each separator-delimited segment is passed
    through ``deal_num_strs`` to normalize the numbers it contains.

    Written for the second splitting pass of record merging; usable
    elsewhere when the same normalization is wanted.

    Any number of separators is accepted. Unpacking the split result into
    exactly two names would raise ``ValueError`` for inputs such as "1-2-3".

    :param strs: raw issue text, e.g. "01-02".
    :return: normalized text, e.g. "1_2".
    """
    strs = strs.replace("-", "_").replace(".", "_").upper()
    # split() yields a single segment when there is no separator, so the
    # plain and the separated case share one code path.
    return "_".join(deal_num_strs(part) for part in strs.split("_"))
@@ -1,11 +1,12 @@
1
1
  import logging
2
+ from itertools import groupby
2
3
 
3
4
  logger = logging.getLogger(__name__) # 创建 logger 实例
4
5
 
5
6
 
6
7
  class BaseDict(object):
7
8
  @classmethod
8
- def flip_dict(cls, original_dict, raise_on_conflict=False):
9
+ def flip_dict(cls, original_dict, raise_on_conflict=True):
9
10
  """
10
11
  翻转字典:将 key 是字符串、value 是列表的字典,转换为 key 是原 value 列表中的元素、value 是原 key 的字典。
11
12
  :param original_dict: 原始字典
@@ -24,3 +25,14 @@ class BaseDict(object):
24
25
  f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
25
26
  flipped_dict[value] = key
26
27
  return flipped_dict
28
+
29
@classmethod
def get_temp_gid_dicts(cls, lists, key_name):
    """Group a list of dicts by the value stored under ``key_name``.

    The rows are sorted by that value first (``groupby`` only merges
    adjacent equal keys), so the returned dict is keyed in ascending key
    order and rows inside a group keep their original relative order.

    Each group is materialized into a list. The sub-iterators produced by
    ``groupby`` share the underlying iterator and become unusable as soon as
    ``groupby`` advances to the next key, so storing them directly would
    hand callers already-exhausted iterators.

    :param lists: iterable of dicts that all contain ``key_name``; the
        values under that key must be mutually orderable.
    :param key_name: key whose value identifies the group.
    :return: dict mapping each distinct value to the list of its rows.
    """
    def group_key(row):
        return row[key_name]

    ordered = sorted(lists, key=group_key)
    return {
        group_id: list(members)
        for group_id, members in groupby(ordered, key=group_key)
    }
@@ -0,0 +1,18 @@
1
+ import jellyfish
2
+ from rapidfuzz.distance import DamerauLevenshtein
3
+
4
+
5
class JaroDamerauLevenshteinMaxSim(object):
    """
    Similarity scorer that returns the larger of a Jaro similarity and a
    normalized Damerau-Levenshtein similarity.

    Per the original author's note, jaro_similarity alone scores sample
    pairs like the following very low:
    s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
    s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
    """

    def get_sim(self, str1: str, str2: str) -> float:
        """Return max(Jaro similarity, 1 - normalized Damerau-Levenshtein distance).

        When both inputs are empty or whitespace-only, the edit-distance
        component is fixed at 0 instead of being computed.
        """
        jaro_score = jellyfish.jaro_similarity(str1, str2)

        both_blank = not str1.strip() and not str2.strip()
        if both_blank:
            edit_score = 0
        else:
            edit_score = 1 - DamerauLevenshtein.normalized_distance(str1, str2)

        return max(jaro_score, edit_score)
@@ -1,10 +1,14 @@
1
1
  import re
2
2
  import threading
3
+ from itertools import combinations
3
4
 
4
5
  import regex
5
6
  import unicodedata
6
7
  from html.parser import HTMLParser
7
8
 
9
+ from re_common.v2.baselibrary.utils.string_clear import rel_clear
10
+ from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
11
+
8
12
 
9
13
  def bj2qj(src):
10
14
  if src is None:
@@ -152,3 +156,57 @@ def remove_spaces_between_chinese_characters(text):
152
156
  """
153
157
  pattern = r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])'
154
158
  return re.sub(pattern, '', text)
159
+
160
+
161
+
162
+ sim_utils = JaroDamerauLevenshteinMaxSim()
163
+
164
def group_similar_texts(texts, threshold=0.9):
    """Group texts into the connected components of a similarity graph.

    Every text is passed through ``rel_clear``; two texts are linked when
    ``sim_utils.get_sim`` of their cleaned forms is at least ``threshold``.
    Texts that are linked directly or through a chain of links end up in
    the same group.

    :param texts: sequence of strings.
    :param threshold: minimum similarity for two texts to be linked.
    :return: list of groups, each a list of indices into ``texts``. Groups
        appear in order of their smallest index; inside a group the indices
        are in depth-first visiting order.
    """
    n = len(texts)
    # Clean each text once instead of once per compared pair. Nothing is
    # cleaned when there are no pairs to compare.
    cleaned = [rel_clear(text) for text in texts] if n > 1 else []

    # Adjacency lists of the similarity graph.
    graph = [[] for _ in range(n)]
    for i, j in combinations(range(n), 2):
        if sim_utils.get_sim(cleaned[i], cleaned[j]) >= threshold:
            graph[i].append(j)
            graph[j].append(i)

    visited = [False] * n
    groups = []
    for start in range(n):
        if visited[start]:
            continue
        # Depth-first traversal with an explicit stack, so a large component
        # cannot exceed Python's recursion limit.
        group = []
        stack = [start]
        while stack:
            node = stack.pop()
            if visited[node]:
                continue
            visited[node] = True
            group.append(node)
            # Push neighbours in reverse so they are popped in adjacency
            # order, giving the same visiting order as a recursive DFS.
            stack.extend(nb for nb in reversed(graph[node]) if not visited[nb])
        groups.append(group)

    return groups
195
+
196
+
197
def get_group_abstract(lists, threshold=0.9):
    """Group record ids by the similarity of their texts.

    The texts are grouped with ``group_similar_texts`` and each resulting
    index is mapped back to the id at the same position.

    :param lists: sequence of ``(id, text)`` pairs.
    :param threshold: similarity threshold forwarded to
        ``group_similar_texts``; defaults to 0.9.
    :return: two-dimensional list; each inner list holds the ids of one group.
    """
    keyid_list = [item[0] for item in lists]
    abstract_list = [item[1] for item in lists]
    groups = group_similar_texts(abstract_list, threshold=threshold)
    return [[keyid_list[text_idx] for text_idx in group] for group in groups]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.11
3
+ Version: 10.0.12
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -170,20 +170,21 @@ re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5a
170
170
  re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
171
  re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
172
172
  re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
173
- re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
+ re_common/v2/baselibrary/tools/list_tools.py,sha256=qYxdLccRbrULOBbaPdJ_MyFFmVJGVMdW5E36nJ3ejr8,249
174
174
  re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
175
175
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
176
176
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
177
- re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
177
+ re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=__9MECbdrMnYc-ksYn2liM8vEbqF9uR4hZKqw86kW1Q,5924
178
178
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
179
  re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
180
- re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
180
+ re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
181
181
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
182
182
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
183
183
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
184
184
  re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
185
185
  re_common/v2/baselibrary/utils/string_clear.py,sha256=pGxL9PlzQDM06sC0j6U0zYRemvsJ7-OOpfzS5ETCxAs,6258
186
- re_common/v2/baselibrary/utils/stringutils.py,sha256=watvMwx8gzEj0Swz7e1cFUUQE1UkN81Fw-Hkjs4l8lo,4233
186
+ re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
187
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=ISheMydPZeNmqsffGDT4Ut_UGpK3r6k8STR78Ere8Wg,6033
187
188
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
188
189
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
189
190
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -210,8 +211,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
210
211
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
211
212
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
212
213
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
213
- re_common-10.0.11.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
- re_common-10.0.11.dist-info/METADATA,sha256=5g6SC3mrd2cryFaMmajqme2KGUoyoEkoDDwtqGeCYso,582
215
- re_common-10.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
- re_common-10.0.11.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
- re_common-10.0.11.dist-info/RECORD,,
214
+ re_common-10.0.12.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
215
+ re_common-10.0.12.dist-info/METADATA,sha256=N9MX7TnI7lhFQyhFaV0n0wr5XWT5prKsFX9gsM-X4T4,582
216
+ re_common-10.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
217
+ re_common-10.0.12.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
218
+ re_common-10.0.12.dist-info/RECORD,,