re-common 10.0.0__py3-none-any.whl → 10.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- # 某些业务中的字符串处理 算是特定场景的工具
1
+ # 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
2
2
  import re
3
3
 
4
4
 
@@ -56,7 +56,7 @@ def get_first_organ(organ):
56
56
  for organ_one in organ_list:
57
57
  # 清理邮政编码
58
58
  organ_one = clean_organ_postcode(organ_one)
59
- if organ_one:
59
+ if organ_one.strip():
60
60
  return organ_one
61
61
 
62
62
  return ""
@@ -69,6 +69,48 @@ def get_first_author(author: str) -> str:
69
69
  for au in au_list:
70
70
  au = re.sub("\\[.*?]", "", au)
71
71
  au = re.sub("\\(.*?\\)", "", au)
72
- if au:
72
+ if au.strip():
73
73
  return au
74
74
  return ""
75
+
76
+
77
+ def get_author_list(author: str):
78
+ lists = []
79
+ if not author:
80
+ return []
81
+ au_list = author.strip().split(";")
82
+ for au in au_list:
83
+ au = re.sub("\\[.*?]", "", au)
84
+ au = re.sub("\\(.*?\\)", "", au)
85
+ if au.strip():
86
+ lists.append(au.strip())
87
+ return lists
88
+
89
+
90
+ def get_scopus_author_abbr(author_row: str):
91
+ if not author_row:
92
+ return ""
93
+ author_list = author_row.split("&&")
94
+ if len(author_list) != 3:
95
+ raise Exception("错误的数据个数 可能来自其他数据源")
96
+
97
+ abbr_list = author_list[0].strip().split(";")
98
+ abbr_list = [author.strip() for author in abbr_list if
99
+ author.strip() and author.strip().lower() not in ("*", "and")]
100
+ return ";".join(abbr_list)
101
+
102
+
103
+ def get_wos_author_abbr(author_row: str):
104
+ if not author_row:
105
+ return ""
106
+ author_list = author_row.split("&&")
107
+ if len(author_list) != 4:
108
+ raise Exception("错误的数据个数 可能来自其他数据源")
109
+ abbr_list = []
110
+ abbr_list_au = author_list[0].strip().split(";")
111
+ abbr_list_ba = author_list[2].strip().split(";")
112
+ abbr_list.extend(abbr_list_au)
113
+ abbr_list.extend(abbr_list_ba)
114
+ abbr_list = [author.strip() for author in abbr_list if
115
+ author.strip() and author.strip().lower() not in ("*", "and")]
116
+ return ";".join(abbr_list)
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  import re
2
3
  import string
3
4
 
@@ -126,11 +127,6 @@ def custom_rstrip(s):
126
127
  return s
127
128
 
128
129
 
129
- def deal_str_first(s1):
130
- # 先对数据处理一波
131
- s1 = s1.replace("’", "")
132
- return s1
133
-
134
130
 
135
131
  def AuthorRatio(
136
132
  s1,
@@ -146,7 +142,7 @@ def AuthorRatio(
146
142
  if is_none(s1) or is_none(s2):
147
143
  return 0
148
144
 
149
- # 处理字符串的程序
145
+ # 处理字符串的程序 外围传入方法
150
146
  if processor is not None:
151
147
  s1 = processor(s1)
152
148
  s2 = processor(s2)
@@ -154,12 +150,14 @@ def AuthorRatio(
154
150
  # 处理后是否为空字符串,如果有 返回0
155
151
  if not s1 or not s2:
156
152
  return 0
157
-
153
+ # 处理音标问题
158
154
  s1 = get_diacritic_variant(s1)
159
155
  s2 = get_diacritic_variant(s2)
160
156
  # 这里提出来是为了少计算 但后期需要平衡内存和算力
157
+ # 移除指定符号 这里做了小写化处理
161
158
  s1_punc = remove_punctuation(s1)
162
159
  s2_punc = remove_punctuation(s2)
160
+ # 分成列表
163
161
  s1_punc_split = s1_punc.split()
164
162
  s2_punc_split = s2_punc.split()
165
163
 
@@ -235,6 +233,11 @@ def AuthorRatio(
235
233
  # 如果循环结束都没有提前返回 False,则表示两个字符串完全匹配,返回 True
236
234
  return True
237
235
 
236
+ # 防止清理后 一方变为空字符串
237
+ if len(l1) == 0 or len(l2) == 0:
238
+ return 0
239
+
240
+ # 这里的逻辑是最后的位置全大写就将他拆分散 比如 joi CJ -> joi C J
238
241
  if len(l1[-1]) != 1 and l1[-1].isupper():
239
242
  t_str = l1[-1]
240
243
  l1 = l1[:-1]
@@ -248,6 +251,16 @@ def AuthorRatio(
248
251
  if len(l1) == len(l2) and (is_same_or_initials_match(l1, l2) or set(l1) == set(l2)):
249
252
  return 1
250
253
 
254
+ # 在这里针对上面一条算法再增加一条算法,先对list 排序在对他进行上面的对比
255
+ # 如果长度相等 简写也是单词的首字母 那么两个名字一致 举例:Guo, Qiang @@ Q. Guo
256
+ sort_l1 = copy.deepcopy(l1)
257
+ sort_l2 = copy.deepcopy(l2)
258
+ sort_l1.sort()
259
+ sort_l2.sort()
260
+ if len(sort_l1) == len(sort_l2) and (is_same_or_initials_match(sort_l1, sort_l2) or set(sort_l1) == set(sort_l2)):
261
+ return 0.99
262
+
263
+
251
264
  ##############################################################
252
265
  # 以上为情况穷举情况,以下为其他情况的相似率计算
253
266
  ##############################################################
@@ -262,7 +275,7 @@ def AuthorRatio(
262
275
  len_ratio = len1 / len2 if len1 > len2 else len2 / len1
263
276
 
264
277
  # 计算归一化的 Indel 相似度。 对于比率<score_cutoff,返回0。
265
- end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1, s2)
278
+ end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1.lower(), s2.lower())
266
279
 
267
280
  # 需要对作者的比率分布进行调研决定哪些是小比率哪些是大比率
268
281
  if len_ratio > 1.5 and len_ratio < 3:
@@ -287,7 +300,7 @@ def AuthorRatio(
287
300
 
288
301
  # 首字母相同提分
289
302
  # if is_contained(extract_initials(s1), extract_initials(s2)):
290
- if is_contained_list([i[:1] for i in l1], [i[:1] for i in l2]):
303
+ if is_contained_list([i[:1].lower() for i in l1], [i[:1].lower() for i in l2]):
291
304
  # 应该提分
292
305
  end_ratio = end_ratio * 1.05
293
306
  else:
@@ -302,7 +315,7 @@ def AuthorRatio(
302
315
  end_ratio = end_ratio * 1.1
303
316
 
304
317
  if l1[0] != l2[0]:
305
- end_ratio = end_ratio * Jaro.normalized_similarity(l1[0], l2[0])
318
+ end_ratio = end_ratio * Jaro.normalized_similarity(l1[0].lower(), l2[0].lower())
306
319
 
307
320
  # 如果字符串本身的相似度高 应该拉上去 否者应该拉下来
308
321
  return min(end_ratio, 1) * 0.5 + normal_end_ratio * 0.5
@@ -7,3 +7,20 @@ def is_all_english_chars(s):
7
7
 
8
8
  def contains_chinese_chars(s):
9
9
  return bool(re.search(r'[\u3400-\u9fff]', s))
10
+
11
+
12
+ def is_empty(value):
13
+ # 如果是 None,直接返回 True
14
+ if value is None:
15
+ return True
16
+
17
+ # 如果是字符串,检查去除空白后是否为空
18
+ if isinstance(value, str):
19
+ return value.strip() == ""
20
+
21
+ # 可选:处理其他可迭代类型(如列表、字典等)
22
+ if hasattr(value, "__len__"):
23
+ return len(value) == 0
24
+
25
+ # 默认情况下,非 None、非空类型返回 False
26
+ return False
@@ -66,6 +66,11 @@ class StringClear(object):
66
66
  self.obj_str = re.sub("[_]", "", self.obj_str)
67
67
  return self
68
68
 
69
+ def replace_dash_with_space(self):
70
+ # 横线换成空格 比 去除符号有时更有用
71
+ self.obj_str = self.obj_str.replace("-", " ")
72
+ return self
73
+
69
74
  def remove_diacritics(self):
70
75
  # 去除音标 转换成字母
71
76
  self.obj_str = get_diacritic_variant(self.obj_str)
@@ -81,6 +86,26 @@ class StringClear(object):
81
86
  self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
82
87
  return self
83
88
 
89
+ def remove_html_tag(self):
90
+ import html
91
+ from parsel import Selector
92
+
93
+ self.obj_str = html.unescape(self.obj_str)
94
+
95
+ def clean_html(html):
96
+ sel = Selector(text=html)
97
+ # 移除脚本和样式
98
+ sel.xpath('//script').remove()
99
+ sel.xpath('//style').remove()
100
+
101
+ # 提取文本
102
+ text_nodes = sel.xpath('//text()').getall()
103
+ return ''.join(t.strip() for t in text_nodes if t.strip())
104
+
105
+ self.obj_str = clean_html(self.obj_str)
106
+
107
+ return self
108
+
84
109
  def get_str(self):
85
110
  return self.obj_str
86
111
 
@@ -89,8 +114,10 @@ def rel_clear(str_obj):
89
114
  # 为融合数据定制的 清理规则
90
115
  return (StringClear(str_obj)
91
116
  .None_to_str() # 空对象转str 防止空对象
92
- .to_str() # 防止其他类型传入
117
+ .to_str() # 防止其他类型传入 比如 int double
93
118
  .qj_to_bj() # 全角转半角
119
+ .remove_html_tag() # html标签清理
120
+ .replace_dash_with_space() # 横线转空格 在 英文 title 中更有用
94
121
  .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
95
122
  .collapse_spaces() # 移除多余空格,连续多个空格变一个
96
123
  .lower() # 小写
@@ -63,6 +63,7 @@ def get_diacritic_variant(char1):
63
63
  return base_char1
64
64
 
65
65
  def get_alphabetic_ratio(text: str) -> float:
66
+ # 返回字母型字符所占比例
66
67
  if not text:
67
68
  return 0
68
69
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.0
3
+ Version: 10.0.2
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -171,15 +171,15 @@ re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
171
171
  re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=o-PNjmcYDkfyiR75Jci_9sSn4cGi_F9jPCIrwYdnb1U,1013
172
172
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
173
173
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
174
- re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=dxrWO800wElZM_4aKolUHSPBYZlxqzXukE4M-LZ13jA,2644
174
+ re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
175
175
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
- re_common/v2/baselibrary/utils/author_smi.py,sha256=_P3I5JXvxHqNNWUwhAyHiJuBFiC0tXvGD8-_HxNiuEU,11051
176
+ re_common/v2/baselibrary/utils/author_smi.py,sha256=Mjl0GYH9e0TP48yxnxC7qgMP2bZW04pa8TQezpKo9L0,11796
177
177
  re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
178
178
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
179
179
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
180
- re_common/v2/baselibrary/utils/string_bool.py,sha256=f5qYdKvTufxmfSsxXN41WFLV--vCwDWU2LeQPbDvKZY,178
181
- re_common/v2/baselibrary/utils/string_clear.py,sha256=LqGvv-UZnsVwiDBN3-PdzDUTfWlAsKsvKlkXqySI0eE,3244
182
- re_common/v2/baselibrary/utils/stringutils.py,sha256=lhDvRL60S6gjhU4D0nfk2Y-c25IyYdYOD0TMoCx-huE,2658
180
+ re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
181
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=R3Asus3NcmL-4SVLsfhYmP7YQwB-H7iCCFPnl9eKO7A,4157
182
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=quAgCdW_ayQwY4AqnZZkZ4NlcSEcy6f1arOVSeP2vEo,2699
183
183
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
184
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
185
185
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -206,8 +206,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
206
206
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
207
207
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
208
208
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
209
- re_common-10.0.0.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
- re_common-10.0.0.dist-info/METADATA,sha256=C8xtx6EWq_g7ScVYYKNZRwq7IuZ_z2esfPwhztPshE0,581
211
- re_common-10.0.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
- re_common-10.0.0.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
- re_common-10.0.0.dist-info/RECORD,,
209
+ re_common-10.0.2.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
+ re_common-10.0.2.dist-info/METADATA,sha256=oVKxavSnd8Vne03NpymV_GLIR6DXI7UfDBBBj2CgVbc,581
211
+ re_common-10.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
+ re_common-10.0.2.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
+ re_common-10.0.2.dist-info/RECORD,,