re-common 10.0.9__py3-none-any.whl → 10.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/tools/search_hash_tools.py +11 -1
- re_common/v2/baselibrary/utils/string_bool.py +47 -3
- {re_common-10.0.9.dist-info → re_common-10.0.10.dist-info}/METADATA +1 -1
- {re_common-10.0.9.dist-info → re_common-10.0.10.dist-info}/RECORD +7 -7
- {re_common-10.0.9.dist-info → re_common-10.0.10.dist-info}/LICENSE +0 -0
- {re_common-10.0.9.dist-info → re_common-10.0.10.dist-info}/WHEEL +0 -0
- {re_common-10.0.9.dist-info → re_common-10.0.10.dist-info}/top_level.txt +0 -0
|
@@ -3,6 +3,8 @@ from typing import List
|
|
|
3
3
|
import jieba
|
|
4
4
|
from datasketch import MinHash, minhash
|
|
5
5
|
|
|
6
|
+
from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
def tokenize(text: str, stopwords=None) -> List[str]:
|
|
8
10
|
"""
|
|
@@ -11,8 +13,16 @@ def tokenize(text: str, stopwords=None) -> List[str]:
|
|
|
11
13
|
if stopwords is None:
|
|
12
14
|
stopwords = []
|
|
13
15
|
words = jieba.lcut(text)
|
|
16
|
+
|
|
14
17
|
# 统计单字符数据 长度,防止结巴分词分不了的单词 将数据分为单个字符
|
|
15
|
-
|
|
18
|
+
|
|
19
|
+
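# Why a helper function instead of an inline condition in the comprehension: in some Spark environments the comprehension's `if` condition is not short-circuited; the whole expression is treated as a single operator, so both sub-conditions would be evaluated.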
|
|
20
|
+
def is_singel_en(i):
    """Return True when ``i`` has length 1 and that character is not CJK."""
    # Check the length first so is_single_cjk_char is only ever called with
    # a length-1 argument.
    if len(i) != 1:
        return False
    return not is_single_cjk_char(i)
|
|
24
|
+
|
|
25
|
+
one_char_size = len([i for i in words if is_singel_en(i)])
|
|
16
26
|
all_size = len(words)
|
|
17
27
|
# 如果单字符个数超过一定比例 就直接用空格分词
|
|
18
28
|
if all_size != 0 and one_char_size / all_size > 0.6:
|
|
@@ -36,11 +36,55 @@ def is_empty(value):
|
|
|
36
36
|
if isinstance(value, str):
|
|
37
37
|
return value.strip() == ""
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
|
|
41
39
|
# 处理其他可迭代类型(如列表、字典等)
|
|
42
40
|
if hasattr(value, "__len__"):
|
|
43
41
|
return len(value) == 0
|
|
44
42
|
|
|
45
43
|
# 默认情况下,非 None、非空类型返回 False
|
|
46
|
-
return False
|
|
44
|
+
return False
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class InvalidCharLengthError(Exception):
    """Custom exception for input whose character length is not exactly 1."""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Inclusive (first, last) code-point ranges that is_single_cjk_char treats as
# CJK. Kept at module level so the table is built once at import time rather
# than on every call.
_CJK_CODE_POINT_RANGES = (
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
    (0x2CEB0, 0x2EBEF),  # CJK Unified Ideographs Extension F
    (0x30000, 0x3134F),  # CJK Unified Ideographs Extension G
    (0x31350, 0x323AF),  # CJK Unified Ideographs Extension H
    (0x3300, 0x33FF),  # CJK Compatibility
    (0xFE30, 0xFE4F),  # CJK Compatibility Forms
    (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
    (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    (0x3105, 0x312F),  # Bopomofo (previously stopped at U+3129)
    (0x31A0, 0x31BF),  # Bopomofo Extended
    (0x3040, 0x309F),  # Hiragana
    (0x30A0, 0x30FF),  # Katakana
    (0x31F0, 0x31FF),  # Katakana Phonetic Extensions
    (0xAC00, 0xD7AF),  # Hangul Syllables
    (0x1100, 0x11FF),  # Hangul Jamo
    (0x3130, 0x318F),  # Hangul Compatibility Jamo (e.g. "ㄱ")
    (0xA960, 0xA97F),  # Hangul Jamo Extended-A
    (0xD7B0, 0xD7FF),  # Hangul Jamo Extended-B
)


def is_single_cjk_char(char):
    """
    Tell whether a single character is a Chinese/Japanese/Korean character.

    The character's code point is compared against the inclusive ranges in
    ``_CJK_CODE_POINT_RANGES``.

    :param char: the single character to check (a length-1 string)
    :return: True if the code point falls in one of the CJK ranges, else False
    :raises InvalidCharLengthError: if ``len(char)`` is not 1
    """
    # Validate the length first: ord() only accepts a single character.
    if len(char) != 1:
        raise InvalidCharLengthError("输入的字符串长度必须为 1,请提供单个字符进行判断。")
    code_point = ord(char)
    return any(first <= code_point <= last for first, last in _CJK_CODE_POINT_RANGES)
|
|
@@ -171,7 +171,7 @@ re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
|
171
171
|
re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
|
|
172
172
|
re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
|
|
173
173
|
re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
174
|
-
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=
|
|
174
|
+
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
|
|
175
175
|
re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
|
|
176
176
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
177
177
|
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
|
|
@@ -181,7 +181,7 @@ re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJ
|
|
|
181
181
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
|
|
182
182
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
183
183
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
184
|
-
re_common/v2/baselibrary/utils/string_bool.py,sha256=
|
|
184
|
+
re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
|
|
185
185
|
re_common/v2/baselibrary/utils/string_clear.py,sha256=LDIf-3Czq1sXp-54aifXdXbdGUX7hpFBKqQa5Azj_lo,5861
|
|
186
186
|
re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
|
|
187
187
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -210,8 +210,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
210
210
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
211
211
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
212
212
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
213
|
-
re_common-10.0.
|
|
214
|
-
re_common-10.0.
|
|
215
|
-
re_common-10.0.
|
|
216
|
-
re_common-10.0.
|
|
217
|
-
re_common-10.0.
|
|
213
|
+
re_common-10.0.10.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
214
|
+
re_common-10.0.10.dist-info/METADATA,sha256=mOarqqiMSzMjAcu1sV0OxUGdwfANLray_3ZpjkAPxFg,582
|
|
215
|
+
re_common-10.0.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
216
|
+
re_common-10.0.10.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
217
|
+
re_common-10.0.10.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|