re-common 10.0.8__py3-none-any.whl → 10.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/tools/dict_tools.py +14 -1
- re_common/v2/baselibrary/tools/search_hash_tools.py +11 -1
- re_common/v2/baselibrary/utils/string_bool.py +21 -1
- re_common/v2/baselibrary/utils/string_clear.py +8 -7
- {re_common-10.0.8.dist-info → re_common-10.0.9.dist-info}/METADATA +1 -1
- {re_common-10.0.8.dist-info → re_common-10.0.9.dist-info}/RECORD +9 -9
- {re_common-10.0.8.dist-info → re_common-10.0.9.dist-info}/LICENSE +0 -0
- {re_common-10.0.8.dist-info → re_common-10.0.9.dist-info}/WHEEL +0 -0
- {re_common-10.0.8.dist-info → re_common-10.0.9.dist-info}/top_level.txt +0 -0
|
@@ -2,6 +2,7 @@ class DotDict(dict):
|
|
|
2
2
|
"""
|
|
3
3
|
让字典成为对象 既可以用字典方式访问 也可以用点访问key
|
|
4
4
|
"""
|
|
5
|
+
|
|
5
6
|
def __init__(self, *args, **kwargs):
|
|
6
7
|
super().__init__(*args, **kwargs)
|
|
7
8
|
# 递归地将嵌套字典转换为 DotDict
|
|
@@ -21,4 +22,16 @@ class DotDict(dict):
|
|
|
21
22
|
def __setattr__(self, key, value):
|
|
22
23
|
if isinstance(value, dict): # 如果值是字典,转换为 DotDict
|
|
23
24
|
value = DotDict(value)
|
|
24
|
-
self[key] = value
|
|
25
|
+
self[key] = value
|
|
26
|
+
|
|
27
|
+
def to_dict(self):
|
|
28
|
+
"""
|
|
29
|
+
将 DotDict 实例转换为普通字典
|
|
30
|
+
"""
|
|
31
|
+
result = {}
|
|
32
|
+
for key, value in self.items():
|
|
33
|
+
if isinstance(value, DotDict):
|
|
34
|
+
result[key] = value.to_dict()
|
|
35
|
+
else:
|
|
36
|
+
result[key] = value
|
|
37
|
+
return result
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import List
|
|
2
2
|
|
|
3
3
|
import jieba
|
|
4
|
-
from datasketch import MinHash
|
|
4
|
+
from datasketch import MinHash, minhash
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def tokenize(text: str, stopwords=None) -> List[str]:
|
|
@@ -31,3 +31,13 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
|
|
|
31
31
|
for word in words:
|
|
32
32
|
minhash.update(word.encode("utf-8"))
|
|
33
33
|
return minhash
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_str_minhash(title):
|
|
37
|
+
from re_common.v2.baselibrary.utils.string_clear import rel_clear
|
|
38
|
+
rel_title = rel_clear(title)
|
|
39
|
+
if not rel_title:
|
|
40
|
+
return ""
|
|
41
|
+
words = tokenize(rel_title)
|
|
42
|
+
minhash = create_minhash(words)
|
|
43
|
+
return minhash
|
|
@@ -10,15 +10,35 @@ def contains_chinese_chars(s):
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
def is_empty(value):
|
|
13
|
+
"""
|
|
14
|
+
判断一个值是否为空。
|
|
15
|
+
|
|
16
|
+
支持的类型:
|
|
17
|
+
- None
|
|
18
|
+
- 空字符串(去除空白后)
|
|
19
|
+
- pandas 的 NaN
|
|
20
|
+
- 其他可迭代类型(如列表、字典等)的长度为 0
|
|
21
|
+
- 其他情况返回 False
|
|
22
|
+
"""
|
|
13
23
|
# 如果是 None,直接返回 True
|
|
14
24
|
if value is None:
|
|
15
25
|
return True
|
|
16
26
|
|
|
27
|
+
# 尝试处理 pandas 的 NaN
|
|
28
|
+
try:
|
|
29
|
+
import pandas as pd
|
|
30
|
+
if pd.isna(value):
|
|
31
|
+
return True
|
|
32
|
+
except ImportError:
|
|
33
|
+
pass # 如果没有安装 pandas,跳过
|
|
34
|
+
|
|
17
35
|
# 如果是字符串,检查去除空白后是否为空
|
|
18
36
|
if isinstance(value, str):
|
|
19
37
|
return value.strip() == ""
|
|
20
38
|
|
|
21
|
-
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# 处理其他可迭代类型(如列表、字典等)
|
|
22
42
|
if hasattr(value, "__len__"):
|
|
23
43
|
return len(value) == 0
|
|
24
44
|
|
|
@@ -165,10 +165,11 @@ def ref_clear(str_obj):
|
|
|
165
165
|
def clear_obj(str_obj):
|
|
166
166
|
# 为对象化定制的清理
|
|
167
167
|
str_obj = clear_au_organ(str_obj)
|
|
168
|
-
str_obj = str_obj.replace("ß", "SS")
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
168
|
+
# str_obj = str_obj.replace("ß", "SS") # "ß" 的 大写就是 "SS"
|
|
169
|
+
result = (StringClear(str_obj)
|
|
170
|
+
.remove_diacritics() # 清理音标
|
|
171
|
+
.upper()
|
|
172
|
+
.get_str() # 获取str
|
|
173
|
+
.strip() # 去掉空格
|
|
174
|
+
)
|
|
175
|
+
return result
|
|
@@ -168,10 +168,10 @@ re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
|
|
|
168
168
|
re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
|
|
169
169
|
re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5aEw-p2u1kYLpvr7RA,2796
|
|
170
170
|
re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
|
-
re_common/v2/baselibrary/tools/dict_tools.py,sha256=
|
|
171
|
+
re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
|
|
172
172
|
re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
|
|
173
173
|
re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
174
|
-
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=
|
|
174
|
+
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=GfZf_zFgEMm6DO0w7d70Fzv1iKzq0WqBTMEfzjEuBAw,1292
|
|
175
175
|
re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
|
|
176
176
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
177
177
|
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
|
|
@@ -181,8 +181,8 @@ re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJ
|
|
|
181
181
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
182
182
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
183
183
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
184
|
-
re_common/v2/baselibrary/utils/string_bool.py,sha256=
|
|
185
|
-
re_common/v2/baselibrary/utils/string_clear.py,sha256=
|
|
184
|
+
re_common/v2/baselibrary/utils/string_bool.py,sha256=tR9JrZuuBxz7oDgSpndKAeer0BYYFrhSxikUsNkHUeg,1099
|
|
185
|
+
re_common/v2/baselibrary/utils/string_clear.py,sha256=LDIf-3Czq1sXp-54aifXdXbdGUX7hpFBKqQa5Azj_lo,5861
|
|
186
186
|
re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
|
|
187
187
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
188
188
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
@@ -210,8 +210,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
210
210
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
211
211
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
212
212
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
213
|
-
re_common-10.0.
|
|
214
|
-
re_common-10.0.
|
|
215
|
-
re_common-10.0.
|
|
216
|
-
re_common-10.0.
|
|
217
|
-
re_common-10.0.
|
|
213
|
+
re_common-10.0.9.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
214
|
+
re_common-10.0.9.dist-info/METADATA,sha256=7EH3E_6nA_nQ7s190Qrc4ylAMGaywO-s_z79css2utM,581
|
|
215
|
+
re_common-10.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
216
|
+
re_common-10.0.9.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
217
|
+
re_common-10.0.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|