re-common 10.0.27__py3-none-any.whl → 10.0.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,11 +76,12 @@ class BaseLngid(object):
76
76
 
77
77
  return lngid
78
78
 
79
- def getDoiid(self, doi):
80
- doi_upper = doi.upper()
81
- limited_id = self.BaseEncodeID(doi_upper)
79
+ def getDoiid(self, doi, case_insensitive=False):
80
+ if case_insensitive is False:
81
+ doi = doi.upper()
82
+ limited_id = self.BaseEncodeID(doi)
82
83
  if len(limited_id) > 240:
83
- limited_id = hashlib.md5(doi_upper.encode('utf8')).hexdigest().upper()
84
+ limited_id = hashlib.md5(doi.encode('utf8')).hexdigest().upper()
84
85
  return limited_id
85
86
 
86
87
  def GetRawid(self, limited_id, case_insensitive=False):
@@ -97,4 +98,3 @@ class BaseLngid(object):
97
98
  raise Exception("长度超过20,不可逆")
98
99
 
99
100
  return uppercase_rawid
100
-
@@ -185,6 +185,7 @@ def AuthorRatio(
185
185
  # 处理后是否为空字符串,如果有 返回0
186
186
  if not s1 or not s2:
187
187
  return 0
188
+ # get_diacritic_variant(unidecode(strs)) 更激进,会丢失非拉丁字符和原文信息,适合需要把多语言文本转换成 ASCII 拼音的场景。
188
189
  # 处理音标问题
189
190
  s1 = get_diacritic_variant(unidecode(s1))
190
191
  s2 = get_diacritic_variant(unidecode(s2))
@@ -9,7 +9,7 @@ from re_common.v2.baselibrary.utils.stringutils import (
9
9
  bj2qj,
10
10
  get_diacritic_variant,
11
11
  clean_html,
12
- remove_spaces_between_chinese_characters, clean_unicode_alnum,
12
+ remove_spaces_between_chinese_characters, clean_unicode_alnum, normalize_nfkc,
13
13
  )
14
14
 
15
15
 
@@ -112,10 +112,18 @@ class StringClear(object):
112
112
  return self
113
113
 
114
114
  def remove_diacritics(self):
115
+ """
116
+ 和 clear_nkfc的关键区别 不去除连字
117
+ """
115
118
  # 去除音标 转换成字母
116
119
  self.obj_str = get_diacritic_variant(self.obj_str)
117
120
  return self
118
121
 
122
+ def clear_nkfc(self):
123
+ self.obj_str = normalize_nfkc(self.obj_str)
124
+ return self
125
+
126
+
119
127
  def remove_brackets(self):
120
128
  # 移除 方括号里面的内容
121
129
  self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
@@ -148,6 +156,12 @@ class StringClear(object):
148
156
  self.obj_str = unquote(self.obj_str)
149
157
  return self
150
158
 
159
+ def ascii_text(self):
160
+ # 只保留 ASCII 范围内的可见字符:空格(32) 到 ~ (126)
161
+ self.obj_str = ''.join(c for c in self.obj_str if 32 <= ord(c) <= 126)
162
+ return self
163
+
164
+
151
165
  def get_str(self):
152
166
  return self.obj_str
153
167
 
@@ -6,6 +6,7 @@ import regex
6
6
  import unicodedata
7
7
  from html.parser import HTMLParser
8
8
 
9
+ from unidecode import unidecode
9
10
 
10
11
  from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
11
12
 
@@ -59,7 +60,27 @@ def qj2bj(src):
59
60
  return ''.join(buf)
60
61
 
61
62
 
63
+ """
64
+ 总结对比表
65
+ 规范名 处理步骤 组合方式 兼容性归一化 主要用途
66
+ NFC 规范分解 → 规范组合 组合 否 保留预组合字符,文本呈现和存储
67
+ NFD 规范分解 不组合 否 拆解字符,便于逐字符处理
68
+ NFKC 兼容性分解 → 规范组合 组合 是 消除兼容差异,文本比较和索引
69
+ NFKD 兼容性分解 → 规范分解 不组合 是 最大程度拆解,文本分析和预处理
70
+ """
71
+
72
+
62
73
  def get_diacritic_variant(char1):
74
+ """
75
+ NFD: 规范分解(Normalization Form D)
76
+ 把字符拆分为基本字符 + 变音符号
77
+
78
+ 但不处理兼容字符(如连字)
79
+
80
+ print(unicodedata.normalize('NFD', 'é')) # 输出: 'é'(e + 组合符号) # 这里看起来是1个字符 len 其实是2
81
+ print(unicodedata.normalize('NFD', 'ﬂ')) # 输出: 'ﬂ'(不变化,连字 U+FB02 只有兼容性分解,NFD 不处理)
82
+
83
+ """
63
84
  # 将字符转换为标准的 Unicode 形式
64
85
  normalized_char1 = unicodedata.normalize('NFD', char1)
65
86
 
@@ -70,6 +91,19 @@ def get_diacritic_variant(char1):
70
91
  return base_char1
71
92
 
72
93
 
94
+ def normalize_nfkc(strs: str) -> str:
95
+ """
96
+ NFKC: 兼容字符归一化 + 组合(Normalization Form Compatibility Composition)
97
+ 把 连字、圈数字、全角字符 等兼容字符转换为标准形式
98
+
99
+ 同时做字符合并(例如 é 不再是 e+´,而是一个字符)
100
+ print(unicodedata.normalize('NFKC', 'ﬂ')) # 输出: 'fl'(连字 U+FB02 被拆成 f + l 两个字符)
101
+ print(unicodedata.normalize('NFKC', '①')) # 输出: '1'
102
+ print(unicodedata.normalize('NFKC', 'Ａ')) # 输出: 'A'(全角 U+FF21 转为半角 A)
103
+ """
104
+ return unicodedata.normalize('NFKC', strs.strip())
105
+
106
+
73
107
  def get_alphabetic_ratio(text: str) -> float:
74
108
  # 返回字母型字符所占比例
75
109
  if not text:
@@ -158,9 +192,9 @@ def remove_spaces_between_chinese_characters(text):
158
192
  return re.sub(pattern, '', text)
159
193
 
160
194
 
161
-
162
195
  sim_utils = JaroDamerauLevenshteinMaxSim()
163
196
 
197
+
164
198
  def group_similar_texts(texts, threshold=0.9):
165
199
  """根据相似度对文本进行分组"""
166
200
  from re_common.v2.baselibrary.utils.string_clear import rel_clear
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.27
3
+ Version: 10.0.28
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -165,7 +165,7 @@ re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
165
  re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
166
  re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
167
167
  re_common/v2/baselibrary/business_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
- re_common/v2/baselibrary/business_utils/baseencodeid.py,sha256=J2Gs_F3-aAwIYjO06T1IIKHE6jen3mFfQVoVW3HywyI,3548
168
+ re_common/v2/baselibrary/business_utils/baseencodeid.py,sha256=3f52e0jtgCFzPEyReia8TupwiE64t_VyBT-a7uQCXAY,3595
169
169
  re_common/v2/baselibrary/business_utils/full_doi_path.py,sha256=PaMIrgDWWt_fzSFyvvDD-8CcYZJTNo6Pj-uR0WafNbY,3319
170
170
  re_common/v2/baselibrary/business_utils/rel_tools.py,sha256=LfnGFCkUSxg1SHvOMOQdP1PiHxIKqk7Syuk5YYpjJag,295
171
171
  re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -191,7 +191,7 @@ re_common/v2/baselibrary/tools/data_processer/data_writer.py,sha256=OgKZ06zRJYNx
191
191
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
192
192
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
193
193
  re_common/v2/baselibrary/utils/api_net_utils.py,sha256=22q3WMWiKVg1IVGr4y2D1JrjhnbQtlChRDJm2S8rGlc,9868
194
- re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
194
+ re_common/v2/baselibrary/utils/author_smi.py,sha256=Fh276u7za-GF_tK9xpBFYF5q1E3_tX22ZouWC8U7w8o,13831
195
195
  re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
196
196
  re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
197
197
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=TPwFct_-UrmO1KCbo4gpV77rsnlCQDumNBbQKL0ZI9o,5953
@@ -202,9 +202,9 @@ re_common/v2/baselibrary/utils/json_cls.py,sha256=M93piYtmgm_wP8E57culTrd_AhHLoG
202
202
  re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
203
203
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
204
204
  re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
205
- re_common/v2/baselibrary/utils/string_clear.py,sha256=ywYR1KrKQyeM-zJgvTmORlfgbLdRSjWWKPe7K8oRx_8,7450
205
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=Ympa0Cs2y_72QeeyMS8de8y_QgtEFJJQ0AgHnylbMUc,7861
206
206
  re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
207
- re_common/v2/baselibrary/utils/stringutils.py,sha256=TI6fw3km1l25ufXrnG6ha8dSBDtRh-MF4nWRt9u8Xbo,6452
207
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=eeuQYgXkWJ9apvyrYPcCCU3biTY9nD1KHos4_1ESNJE,7883
208
208
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
209
209
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
210
210
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -231,8 +231,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
231
231
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
232
232
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
233
233
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
234
- re_common-10.0.27.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
235
- re_common-10.0.27.dist-info/METADATA,sha256=P-xatKifINopvPpQFudSbxCKEpgqzLyw5Pesn-7_VvU,582
236
- re_common-10.0.27.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
237
- re_common-10.0.27.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
238
- re_common-10.0.27.dist-info/RECORD,,
234
+ re_common-10.0.28.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
235
+ re_common-10.0.28.dist-info/METADATA,sha256=6LyRvl5fLSmKd4qNyZFc72DkO8_hyJ6FQ27dba4PEvc,582
236
+ re_common-10.0.28.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
237
+ re_common-10.0.28.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
238
+ re_common-10.0.28.dist-info/RECORD,,