PyPI - re-common - Versions diffs - 10.0.0__py3-none-any.whl → 10.0.2__py3-none-any.whl - Mend

re-common 10.0.0__py3-none-any.whl → 10.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

re_common/v2/baselibrary/utils/BusinessStringUtil.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# 某些业务中的字符串处理 算是特定场景的工具
+# 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
 import re
@@ -56,7 +56,7 @@ def get_first_organ(organ):
     for organ_one in organ_list:
         # 清理邮政编码
         organ_one = clean_organ_postcode(organ_one)
-        if organ_one:
+        if organ_one.strip():
             return organ_one
     return ""
@@ -69,6 +69,48 @@ def get_first_author(author: str) -> str:
     for au in au_list:
         au = re.sub("\\[.*?]", "", au)
         au = re.sub("\\(.*?\\)", "", au)
-        if au:
+        if au.strip():
             return au
     return ""
+def get_author_list(author: str):
+    lists = []
+    if not author:
+        return []
+    au_list = author.strip().split(";")
+    for au in au_list:
+        au = re.sub("\\[.*?]", "", au)
+        au = re.sub("\\(.*?\\)", "", au)
+        if au.strip():
+            lists.append(au.strip())
+    return lists
+def get_scopus_author_abbr(author_row: str):
+    if not author_row:
+        return ""
+    author_list = author_row.split("&&")
+    if len(author_list) != 3:
+        raise Exception("错误的数据个数 可能来自其他数据源")
+    abbr_list = author_list[0].strip().split(";")
+    abbr_list = [author.strip() for author in abbr_list if
+                 author.strip() and author.strip().lower() not in ("*", "and")]
+    return ";".join(abbr_list)
+def get_wos_author_abbr(author_row: str):
+    if not author_row:
+        return ""
+    author_list = author_row.split("&&")
+    if len(author_list) != 4:
+        raise Exception("错误的数据个数 可能来自其他数据源")
+    abbr_list = []
+    abbr_list_au = author_list[0].strip().split(";")
+    abbr_list_ba = author_list[2].strip().split(";")
+    abbr_list.extend(abbr_list_au)
+    abbr_list.extend(abbr_list_ba)
+    abbr_list = [author.strip() for author in abbr_list if
+                 author.strip() and author.strip().lower() not in ("*", "and")]
+    return ";".join(abbr_list)

re_common/v2/baselibrary/utils/author_smi.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import copy
 import re
 import string
@@ -126,11 +127,6 @@ def custom_rstrip(s):
     return s
-def deal_str_first(s1):
-    # 先对数据处理一波
-    s1 = s1.replace("’", "")
-    return s1
 def AuthorRatio(
         s1,
@@ -146,7 +142,7 @@ def AuthorRatio(
     if is_none(s1) or is_none(s2):
         return 0
-    # 处理字符串的程序
+    # 处理字符串的程序 外围传入方法
     if processor is not None:
         s1 = processor(s1)
         s2 = processor(s2)
@@ -154,12 +150,14 @@ def AuthorRatio(
         # 处理后是否为空字符串，如果有 返回0
         if not s1 or not s2:
             return 0
+    # 处理音标问题
     s1 = get_diacritic_variant(s1)
     s2 = get_diacritic_variant(s2)
     # 这里提出来是为了少计算 但后期需要平衡内存和算力
+    # 移除指定符号 这里做了小写化处理
     s1_punc = remove_punctuation(s1)
     s2_punc = remove_punctuation(s2)
+    # 分成列表
     s1_punc_split = s1_punc.split()
     s2_punc_split = s2_punc.split()
@@ -235,6 +233,11 @@ def AuthorRatio(
         # 如果循环结束都没有提前返回 False，则表示两个字符串完全匹配，返回 True
         return True
+    # 防止清理后 一方变为空字符串
+    if len(l1) == 0 or len(l2) == 0:
+        return 0
+    #  这里的逻辑是最后的位置全大写就将他拆分散 比如 joi CJ -> joi C J
     if len(l1[-1]) != 1 and l1[-1].isupper():
         t_str = l1[-1]
         l1 = l1[:-1]
@@ -248,6 +251,16 @@ def AuthorRatio(
     if len(l1) == len(l2) and (is_same_or_initials_match(l1, l2) or set(l1) == set(l2)):
         return 1
+    # 在这里针对上面一条算法再增加一条算法，先对list 排序在对他进行上面的对比
+    # 如果长度相等 简写也是单词的首字母 那么两个名字一致 举例:Guo, Qiang @@ Q. Guo
+    sort_l1 = copy.deepcopy(l1)
+    sort_l2 = copy.deepcopy(l2)
+    sort_l1.sort()
+    sort_l2.sort()
+    if len(sort_l1) == len(sort_l2) and (is_same_or_initials_match(sort_l1, sort_l2) or set(sort_l1) == set(sort_l2)):
+        return 0.99
     ##############################################################
     # 以上为情况穷举情况，以下为其他情况的相似率计算
     ##############################################################
@@ -262,7 +275,7 @@ def AuthorRatio(
     len_ratio = len1 / len2 if len1 > len2 else len2 / len1
     # 计算归一化的 Indel 相似度。 对于比率<score_cutoff，返回0。
-    end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1, s2)
+    end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1.lower(), s2.lower())
     # 需要对作者的比率分布进行调研决定哪些是小比率哪些是大比率
     if len_ratio > 1.5 and len_ratio < 3:
@@ -287,7 +300,7 @@ def AuthorRatio(
     # 首字母相同提分
     # if is_contained(extract_initials(s1), extract_initials(s2)):
-    if is_contained_list([i[:1] for i in l1], [i[:1] for i in l2]):
+    if is_contained_list([i[:1].lower() for i in l1], [i[:1].lower() for i in l2]):
         # 应该提分
         end_ratio = end_ratio * 1.05
     else:
@@ -302,7 +315,7 @@ def AuthorRatio(
         end_ratio = end_ratio * 1.1
     if l1[0] != l2[0]:
-        end_ratio = end_ratio * Jaro.normalized_similarity(l1[0], l2[0])
+        end_ratio = end_ratio * Jaro.normalized_similarity(l1[0].lower(), l2[0].lower())
     # 如果字符串本身的相似度高 应该拉上去 否者应该拉下来
     return min(end_ratio, 1) * 0.5 + normal_end_ratio * 0.5

re_common/v2/baselibrary/utils/string_bool.py CHANGED Viewed

@@ -7,3 +7,20 @@ def is_all_english_chars(s):
 def contains_chinese_chars(s):
     return bool(re.search(r'[\u3400-\u9fff]', s))
+def is_empty(value):
+    # 如果是 None，直接返回 True
+    if value is None:
+        return True
+    # 如果是字符串，检查去除空白后是否为空
+    if isinstance(value, str):
+        return value.strip() == ""
+    # 可选：处理其他可迭代类型（如列表、字典等）
+    if hasattr(value, "__len__"):
+        return len(value) == 0
+    # 默认情况下，非 None、非空类型返回 False
+    return False

re_common/v2/baselibrary/utils/string_clear.py CHANGED Viewed

@@ -66,6 +66,11 @@ class StringClear(object):
         self.obj_str = re.sub("[_]", "", self.obj_str)
         return self
+    def replace_dash_with_space(self):
+        # 横线换成空格 比 去除符号有时更有用
+        self.obj_str = self.obj_str.replace("-", " ")
+        return self
     def remove_diacritics(self):
         # 去除音标 转换成字母
         self.obj_str = get_diacritic_variant(self.obj_str)
@@ -81,6 +86,26 @@ class StringClear(object):
         self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
         return self
+    def remove_html_tag(self):
+        import html
+        from parsel import Selector
+        self.obj_str = html.unescape(self.obj_str)
+        def clean_html(html):
+            sel = Selector(text=html)
+            # 移除脚本和样式
+            sel.xpath('//script').remove()
+            sel.xpath('//style').remove()
+            # 提取文本
+            text_nodes = sel.xpath('//text()').getall()
+            return ''.join(t.strip() for t in text_nodes if t.strip())
+        self.obj_str = clean_html(self.obj_str)
+        return self
     def get_str(self):
         return self.obj_str
@@ -89,8 +114,10 @@ def rel_clear(str_obj):
     # 为融合数据定制的 清理规则
     return (StringClear(str_obj)
             .None_to_str()  # 空对象转str 防止空对象
-            .to_str()  # 防止其他类型传入
+            .to_str()  # 防止其他类型传入 比如 int double
             .qj_to_bj()  # 全角转半角
+            .remove_html_tag() # html标签清理
+            .replace_dash_with_space()  # 横线转空格 在 英文 title 中更有用
             .remove_special_chars()  # 移除特殊字符，仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
             .collapse_spaces()  # 移除多余空格,连续多个空格变一个
             .lower()  # 小写

re_common/v2/baselibrary/utils/stringutils.py CHANGED Viewed

@@ -63,6 +63,7 @@ def get_diacritic_variant(char1):
     return base_char1
 def get_alphabetic_ratio(text: str) -> float:
+    # 返回字母型字符所占比例
     if not text:
         return 0

{re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: re_common
-Version: 10.0.0
+Version: 10.0.2
 Summary: a library about all python projects
 Home-page: https://gitee.com/xujiangios/re-common
 Author: vic

{re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/RECORD RENAMED Viewed

@@ -171,15 +171,15 @@ re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=o-PNjmcYDkfyiR75Jci_9sSn4cGi_F9jPCIrwYdnb1U,1013
 re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
 re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
-re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=dxrWO800wElZM_4aKolUHSPBYZlxqzXukE4M-LZ13jA,2644
+re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
 re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-re_common/v2/baselibrary/utils/author_smi.py,sha256=_P3I5JXvxHqNNWUwhAyHiJuBFiC0tXvGD8-_HxNiuEU,11051
+re_common/v2/baselibrary/utils/author_smi.py,sha256=Mjl0GYH9e0TP48yxnxC7qgMP2bZW04pa8TQezpKo9L0,11796
 re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
 re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
 re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
-re_common/v2/baselibrary/utils/string_bool.py,sha256=f5qYdKvTufxmfSsxXN41WFLV--vCwDWU2LeQPbDvKZY,178
-re_common/v2/baselibrary/utils/string_clear.py,sha256=LqGvv-UZnsVwiDBN3-PdzDUTfWlAsKsvKlkXqySI0eE,3244
-re_common/v2/baselibrary/utils/stringutils.py,sha256=lhDvRL60S6gjhU4D0nfk2Y-c25IyYdYOD0TMoCx-huE,2658
+re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
+re_common/v2/baselibrary/utils/string_clear.py,sha256=R3Asus3NcmL-4SVLsfhYmP7YQwB-H7iCCFPnl9eKO7A,4157
+re_common/v2/baselibrary/utils/stringutils.py,sha256=quAgCdW_ayQwY4AqnZZkZ4NlcSEcy6f1arOVSeP2vEo,2699
 re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
 re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -206,8 +206,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
 re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
 re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
 re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
-re_common-10.0.0.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-re_common-10.0.0.dist-info/METADATA,sha256=C8xtx6EWq_g7ScVYYKNZRwq7IuZ_z2esfPwhztPshE0,581
-re_common-10.0.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-re_common-10.0.0.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
-re_common-10.0.0.dist-info/RECORD,,
+re_common-10.0.2.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+re_common-10.0.2.dist-info/METADATA,sha256=oVKxavSnd8Vne03NpymV_GLIR6DXI7UfDBBBj2CgVbc,581
+re_common-10.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+re_common-10.0.2.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
+re_common-10.0.2.dist-info/RECORD,,

{re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{re_common-10.0.0.dist-info → re_common-10.0.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

re-common 10.0.0__py3-none-any.whl → 10.0.2__py3-none-any.whl

re-common 10.0.0__py3-none-any.whl → 10.0.2__py3-none-any.whl