re-common 10.0.1__py3-none-any.whl → 10.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- # 某些业务中的字符串处理 算是特定场景的工具
1
+ # 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
2
2
  import re
3
3
 
4
4
 
@@ -56,7 +56,7 @@ def get_first_organ(organ):
56
56
  for organ_one in organ_list:
57
57
  # 清理邮政编码
58
58
  organ_one = clean_organ_postcode(organ_one)
59
- if organ_one:
59
+ if organ_one.strip():
60
60
  return organ_one
61
61
 
62
62
  return ""
@@ -69,6 +69,48 @@ def get_first_author(author: str) -> str:
69
69
  for au in au_list:
70
70
  au = re.sub("\\[.*?]", "", au)
71
71
  au = re.sub("\\(.*?\\)", "", au)
72
- if au:
72
+ if au.strip():
73
73
  return au
74
74
  return ""
75
+
76
+
77
def get_author_list(author: str) -> list:
    """Split a ``;``-separated author string into a list of cleaned names.

    For every entry, bracketed ``[...]`` and parenthesised ``(...)`` spans are
    removed and surrounding whitespace is stripped; entries that are empty
    after cleaning are dropped.

    Args:
        author: Raw author string whose entries are separated by ``;``.

    Returns:
        The cleaned names in their original order, or ``[]`` when ``author``
        is falsy (``None`` or ``""``).
    """
    if not author:
        return []
    # Clean and strip each entry exactly once; the filter below then drops
    # entries that consisted only of annotations and/or whitespace.
    cleaned = (
        re.sub(r"\(.*?\)", "", re.sub(r"\[.*?]", "", entry)).strip()
        for entry in author.strip().split(";")
    )
    return [name for name in cleaned if name]
88
+
89
+
90
def get_scopus_author_abbr(author_row: str):
    """Extract the abbreviated author names from a ``&&``-joined author row.

    The row must split on ``&&`` into exactly three fields; only the first
    field is used. It is split on ``;`` and each name is stripped. Empty names
    and the placeholders ``*`` / ``and`` (case-insensitive) are discarded.

    Args:
        author_row: Three ``&&``-separated fields, the first holding the
            ``;``-separated abbreviated names.

    Returns:
        The kept names joined with ``;``, or ``""`` when ``author_row`` is
        empty, ``None`` or whitespace only.

    Raises:
        ValueError: If the row does not contain exactly three fields. It is a
            subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """
    # A whitespace-only row carries no data; treat it like an empty one
    # instead of reporting a wrong field count.
    if not author_row or not author_row.strip():
        return ""
    fields = author_row.split("&&")
    if len(fields) != 3:
        raise ValueError("错误的数据个数 可能来自其他数据源")

    # Strip every name once, then filter on the stripped value.
    names = (name.strip() for name in fields[0].split(";"))
    return ";".join(
        name for name in names if name and name.lower() not in ("*", "and")
    )
101
+
102
+
103
def get_wos_author_abbr(author_row: str):
    """Extract the abbreviated author names from a ``&&``-joined author row.

    The row must split on ``&&`` into exactly four fields. The first and third
    fields are each split on ``;`` and concatenated in that order. Each name
    is stripped; empty names and the placeholders ``*`` / ``and``
    (case-insensitive) are discarded.

    Args:
        author_row: Four ``&&``-separated fields; fields 0 and 2 hold
            ``;``-separated abbreviated names.

    Returns:
        The kept names joined with ``;``, or ``""`` when ``author_row`` is
        empty, ``None`` or whitespace only.

    Raises:
        ValueError: If the row does not contain exactly four fields. It is a
            subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """
    # A whitespace-only row carries no data; treat it like an empty one
    # instead of reporting a wrong field count.
    if not author_row or not author_row.strip():
        return ""
    fields = author_row.split("&&")
    if len(fields) != 4:
        raise ValueError("错误的数据个数 可能来自其他数据源")

    # Names from field 0 come first, followed by names from field 2.
    candidates = fields[0].split(";") + fields[2].split(";")
    # Strip every name once, then filter on the stripped value.
    names = (name.strip() for name in candidates)
    return ";".join(
        name for name in names if name and name.lower() not in ("*", "and")
    )
@@ -127,11 +127,6 @@ def custom_rstrip(s):
127
127
  return s
128
128
 
129
129
 
130
- def deal_str_first(s1):
131
- # 先对数据处理一波
132
- s1 = s1.replace("’", "")
133
- return s1
134
-
135
130
 
136
131
  def AuthorRatio(
137
132
  s1,
@@ -147,7 +142,7 @@ def AuthorRatio(
147
142
  if is_none(s1) or is_none(s2):
148
143
  return 0
149
144
 
150
- # 处理字符串的程序
145
+ # 处理字符串的程序 外围传入方法
151
146
  if processor is not None:
152
147
  s1 = processor(s1)
153
148
  s2 = processor(s2)
@@ -155,12 +150,14 @@ def AuthorRatio(
155
150
  # 处理后是否为空字符串,如果有 返回0
156
151
  if not s1 or not s2:
157
152
  return 0
158
-
153
+ # 处理音标问题
159
154
  s1 = get_diacritic_variant(s1)
160
155
  s2 = get_diacritic_variant(s2)
161
156
  # 这里提出来是为了少计算 但后期需要平衡内存和算力
157
+ # 移除指定符号 这里做了小写化处理
162
158
  s1_punc = remove_punctuation(s1)
163
159
  s2_punc = remove_punctuation(s2)
160
+ # 分成列表
164
161
  s1_punc_split = s1_punc.split()
165
162
  s2_punc_split = s2_punc.split()
166
163
 
@@ -236,6 +233,11 @@ def AuthorRatio(
236
233
  # 如果循环结束都没有提前返回 False,则表示两个字符串完全匹配,返回 True
237
234
  return True
238
235
 
236
+ # 防止清理后 一方变为空字符串
237
+ if len(l1) == 0 or len(l2) == 0:
238
+ return 0
239
+
240
+ # 这里的逻辑是最后的位置全大写就将他拆分散 比如 joi CJ -> joi C J
239
241
  if len(l1[-1]) != 1 and l1[-1].isupper():
240
242
  t_str = l1[-1]
241
243
  l1 = l1[:-1]
@@ -7,3 +7,20 @@ def is_all_english_chars(s):
7
7
 
8
8
def contains_chinese_chars(s):
    """Return True if ``s`` has at least one character in U+3400..U+9FFF."""
    hit = re.search(r'[\u3400-\u9fff]', s)
    return hit is not None
10
+
11
+
12
def is_empty(value):
    """Report whether ``value`` counts as empty.

    Args:
        value: Any object.

    Returns:
        ``True`` for ``None``, for a string that is empty or whitespace only,
        and for any object whose ``len()`` is 0; ``False`` otherwise,
        including objects that have no usable length.
    """
    # None is always empty.
    if value is None:
        return True

    # Strings are empty when nothing remains after stripping whitespace.
    if isinstance(value, str):
        return not value.strip()

    # Sized containers (list, dict, set, tuple, ...) are empty at length 0.
    # Asking len() directly instead of probing for a ``__len__`` attribute
    # avoids a crash on values that expose the attribute but cannot be
    # measured, e.g. a class object such as ``list``.
    try:
        return len(value) == 0
    except TypeError:
        # No usable length: a non-None, non-container value is not empty.
        return False
@@ -66,6 +66,11 @@ class StringClear(object):
66
66
  self.obj_str = re.sub("[_]", "", self.obj_str)
67
67
  return self
68
68
 
69
def replace_dash_with_space(self):
    """Turn every ``-`` in ``self.obj_str`` into a space; return ``self``.

    Replacing the dash with a space, rather than deleting it, is sometimes
    more useful than plain symbol removal.
    """
    pieces = self.obj_str.split("-")
    self.obj_str = " ".join(pieces)
    return self
73
+
69
74
  def remove_diacritics(self):
70
75
  # 去除音标 转换成字母
71
76
  self.obj_str = get_diacritic_variant(self.obj_str)
@@ -81,6 +86,26 @@ class StringClear(object):
81
86
  self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
82
87
  return self
83
88
 
89
def remove_html_tag(self):
    """Reduce ``self.obj_str`` to the text content of its HTML.

    Steps, in order:
      1. HTML entities are decoded with ``html.unescape``.
      2. The result is parsed with ``parsel.Selector``.
      3. ``<script>`` and ``<style>`` elements are removed.
      4. Every remaining text node is stripped; nodes that are empty after
         stripping are dropped and the rest are concatenated with no
         separator.

    Returns:
        ``self``, so the call can be chained.
    """
    import html
    from parsel import Selector

    # Entities are decoded before parsing, so the parser sees the decoded
    # text rather than the escaped form.
    decoded = html.unescape(self.obj_str)

    selector = Selector(text=decoded)
    # Drop script and style elements before collecting text.
    # NOTE(review): newer parsel releases appear to deprecate ``remove()`` in
    # favour of ``drop()`` — confirm the pinned parsel version before changing.
    selector.xpath('//script').remove()
    selector.xpath('//style').remove()

    # Collect all text nodes; join them without a separator.
    fragments = (node.strip() for node in selector.xpath('//text()').getall())
    self.obj_str = ''.join(fragment for fragment in fragments if fragment)

    return self
108
+
84
109
  def get_str(self):
85
110
  return self.obj_str
86
111
 
@@ -89,8 +114,10 @@ def rel_clear(str_obj):
89
114
  # 为融合数据定制的 清理规则
90
115
  return (StringClear(str_obj)
91
116
  .None_to_str() # 空对象转str 防止空对象
92
- .to_str() # 防止其他类型传入
117
+ .to_str() # 防止其他类型传入 比如 int double
93
118
  .qj_to_bj() # 全角转半角
119
+ .remove_html_tag() # html标签清理
120
+ .replace_dash_with_space() # 横线转空格 在 英文 title 中更有用
94
121
  .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
95
122
  .collapse_spaces() # 移除多余空格,连续多个空格变一个
96
123
  .lower() # 小写
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.1
3
+ Version: 10.0.2
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -171,14 +171,14 @@ re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
171
171
  re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=o-PNjmcYDkfyiR75Jci_9sSn4cGi_F9jPCIrwYdnb1U,1013
172
172
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
173
173
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
174
- re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=dxrWO800wElZM_4aKolUHSPBYZlxqzXukE4M-LZ13jA,2644
174
+ re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
175
175
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
- re_common/v2/baselibrary/utils/author_smi.py,sha256=wkuoGEBNM28k8D1E83vBxJD5N4xgzr6aAQFMVPJ2tnc,11585
176
+ re_common/v2/baselibrary/utils/author_smi.py,sha256=Mjl0GYH9e0TP48yxnxC7qgMP2bZW04pa8TQezpKo9L0,11796
177
177
  re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
178
178
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
179
179
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
180
- re_common/v2/baselibrary/utils/string_bool.py,sha256=f5qYdKvTufxmfSsxXN41WFLV--vCwDWU2LeQPbDvKZY,178
181
- re_common/v2/baselibrary/utils/string_clear.py,sha256=LqGvv-UZnsVwiDBN3-PdzDUTfWlAsKsvKlkXqySI0eE,3244
180
+ re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
181
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=R3Asus3NcmL-4SVLsfhYmP7YQwB-H7iCCFPnl9eKO7A,4157
182
182
  re_common/v2/baselibrary/utils/stringutils.py,sha256=quAgCdW_ayQwY4AqnZZkZ4NlcSEcy6f1arOVSeP2vEo,2699
183
183
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
184
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
@@ -206,8 +206,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
206
206
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
207
207
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
208
208
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
209
- re_common-10.0.1.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
- re_common-10.0.1.dist-info/METADATA,sha256=xIF1hPdvDgN_bQ3YpyAG3_tjxGOIVQvNUM5NraOe73o,581
211
- re_common-10.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
- re_common-10.0.1.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
- re_common-10.0.1.dist-info/RECORD,,
209
+ re_common-10.0.2.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
+ re_common-10.0.2.dist-info/METADATA,sha256=oVKxavSnd8Vne03NpymV_GLIR6DXI7UfDBBBj2CgVbc,581
211
+ re_common-10.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
+ re_common-10.0.2.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
+ re_common-10.0.2.dist-info/RECORD,,