re-common 10.0.9__py3-none-any.whl → 10.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,8 @@ from typing import List
3
3
  import jieba
4
4
  from datasketch import MinHash, minhash
5
5
 
6
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
7
+
6
8
 
7
9
  def tokenize(text: str, stopwords=None) -> List[str]:
8
10
  """
@@ -11,8 +13,16 @@ def tokenize(text: str, stopwords=None) -> List[str]:
11
13
  if stopwords is None:
12
14
  stopwords = []
13
15
  words = jieba.lcut(text)
16
+
14
17
  # 统计单字符数据 长度,防止结巴分词分不了的单词 将数据分为单个字符
15
- one_char_size = len([i for i in words if len(i) == 1])
18
+
19
+ # 这里为什么使用函数 而不是在推导式中兼容,主要是在一些 spark中 推导式的if 条件不遵循最短路径原则会将表达式当做一个整体算子
20
+ def is_singel_en(i):
21
+ if len(i) == 1 and not is_single_cjk_char(i):
22
+ return True
23
+ return False
24
+
25
+ one_char_size = len([i for i in words if is_singel_en(i)])
16
26
  all_size = len(words)
17
27
  # 如果单字符个数超过一定比例 就直接用空格分词
18
28
  if all_size != 0 and one_char_size / all_size > 0.6:
@@ -36,11 +36,55 @@ def is_empty(value):
36
36
  if isinstance(value, str):
37
37
  return value.strip() == ""
38
38
 
39
-
40
-
41
39
  # 处理其他可迭代类型(如列表、字典等)
42
40
  if hasattr(value, "__len__"):
43
41
  return len(value) == 0
44
42
 
45
43
  # 默认情况下,非 None、非空类型返回 False
46
- return False
44
+ return False
45
+
46
+
47
class InvalidCharLengthError(Exception):
    """Custom exception for input whose character length is not exactly 1."""
50
+
51
+
52
def is_single_cjk_char(char):
    """
    Tell whether a single character is a CJK (Chinese/Japanese/Korean) character.

    The decision is a plain code-point range test against the Unicode blocks
    listed in ``ranges`` below; no Unicode database lookup is involved.

    :param char: the character to classify; its length must be exactly 1
    :return: True if the code point lies in one of the listed CJK ranges,
             otherwise False
    :raises InvalidCharLengthError: if ``len(char)`` is not 1
    """
    # Reject anything that is not exactly one character long.
    if len(char) != 1:
        raise InvalidCharLengthError("输入的字符串长度必须为 1,请提供单个字符进行判断。")
    code_point = ord(char)
    # Inclusive (first, last) code-point ranges. Kept as an immutable tuple
    # so the table is not rebuilt as a fresh list on every call.
    ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0x2CEB0, 0x2EBEF),  # CJK Unified Ideographs Extension F
        (0x2EBF0, 0x2EE5F),  # CJK Unified Ideographs Extension I (was missing)
        (0x30000, 0x3134F),  # CJK Unified Ideographs Extension G
        (0x31350, 0x323AF),  # CJK Unified Ideographs Extension H
        (0x3300, 0x33FF),    # CJK Compatibility
        (0xFE30, 0xFE4F),    # CJK Compatibility Forms
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
        # Bopomofo letters run through U+312F; the previous upper bound of
        # U+3129 wrongly excluded U+312A..U+312F.
        (0x3105, 0x312F),    # Bopomofo
        (0x31A0, 0x31BF),    # Bopomofo Extended
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0x31F0, 0x31FF),    # Katakana Phonetic Extensions
        (0xAC00, 0xD7AF),    # Hangul Syllables
        (0x1100, 0x11FF),    # Hangul Jamo
        (0x3130, 0x318F),    # Hangul Compatibility Jamo (was missing)
        (0xA960, 0xA97F),    # Hangul Jamo Extended-A
        (0xD7B0, 0xD7FF),    # Hangul Jamo Extended-B
    )
    return any(first <= code_point <= last for first, last in ranges)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.9
3
+ Version: 10.0.10
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -171,7 +171,7 @@ re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
171
171
  re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
172
172
  re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
173
173
  re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
174
- re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=GfZf_zFgEMm6DO0w7d70Fzv1iKzq0WqBTMEfzjEuBAw,1292
174
+ re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
175
175
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
176
176
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
177
177
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
@@ -181,7 +181,7 @@ re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJ
181
181
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
182
182
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
183
183
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
184
- re_common/v2/baselibrary/utils/string_bool.py,sha256=tR9JrZuuBxz7oDgSpndKAeer0BYYFrhSxikUsNkHUeg,1099
184
+ re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
185
185
  re_common/v2/baselibrary/utils/string_clear.py,sha256=LDIf-3Czq1sXp-54aifXdXbdGUX7hpFBKqQa5Azj_lo,5861
186
186
  re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
187
187
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -210,8 +210,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
210
210
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
211
211
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
212
212
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
213
- re_common-10.0.9.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
- re_common-10.0.9.dist-info/METADATA,sha256=7EH3E_6nA_nQ7s190Qrc4ylAMGaywO-s_z79css2utM,581
215
- re_common-10.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
- re_common-10.0.9.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
- re_common-10.0.9.dist-info/RECORD,,
213
+ re_common-10.0.10.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
+ re_common-10.0.10.dist-info/METADATA,sha256=mOarqqiMSzMjAcu1sV0OxUGdwfANLray_3ZpjkAPxFg,582
215
+ re_common-10.0.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
+ re_common-10.0.10.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
+ re_common-10.0.10.dist-info/RECORD,,