re-common 10.0.9__py3-none-any.whl → 10.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,8 @@ from typing import List
3
3
  import jieba
4
4
  from datasketch import MinHash, minhash
5
5
 
6
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
7
+
6
8
 
7
9
  def tokenize(text: str, stopwords=None) -> List[str]:
8
10
  """
@@ -11,8 +13,16 @@ def tokenize(text: str, stopwords=None) -> List[str]:
11
13
  if stopwords is None:
12
14
  stopwords = []
13
15
  words = jieba.lcut(text)
16
+
14
17
  # 统计单字符数据 长度,防止结巴分词分不了的单词 将数据分为单个字符
15
- one_char_size = len([i for i in words if len(i) == 1])
18
+
19
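+ # Why a helper function is used here instead of writing the full condition inside the comprehension: in some Spark environments the comprehension's `if` condition is not short-circuited and the whole expression is treated as a single operator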
20
def is_singel_en(i):
    """Return True when token ``i`` is exactly one character long and that character is not CJK."""
    # The length test comes first so is_single_cjk_char is only ever
    # handed a single character.
    return len(i) == 1 and not is_single_cjk_char(i)
24
+
25
+ one_char_size = len([i for i in words if is_singel_en(i)])
16
26
  all_size = len(words)
17
27
  # 如果单字符个数超过一定比例 就直接用空格分词
18
28
  if all_size != 0 and one_char_size / all_size > 0.6:
@@ -36,11 +36,55 @@ def is_empty(value):
36
36
  if isinstance(value, str):
37
37
  return value.strip() == ""
38
38
 
39
-
40
-
41
39
  # 处理其他可迭代类型(如列表、字典等)
42
40
  if hasattr(value, "__len__"):
43
41
  return len(value) == 0
44
42
 
45
43
  # 默认情况下,非 None、非空类型返回 False
46
- return False
44
+ return False
45
+
46
+
47
class InvalidCharLengthError(Exception):
    """Custom exception for input whose character length is not 1."""
50
+
51
+
52
def is_single_cjk_char(char):
    """
    Tell whether a single character is a Chinese/Japanese/Korean character.

    :param char: the single character to classify
    :return: True if the character's code point falls in one of the CJK
        ranges listed below, otherwise False
    :raises InvalidCharLengthError: if ``len(char)`` is not 1
    """
    # Reject anything that is not exactly one character long.
    if len(char) != 1:
        raise InvalidCharLengthError("输入的字符串长度必须为 1,请提供单个字符进行判断。")
    code_point = ord(char)
    # A tuple of constant tuples is a compile-time constant, so the table is
    # not rebuilt on every call (the function is used per token).
    ranges = (
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0x2CEB0, 0x2EBEF),  # CJK Unified Ideographs Extension F
        (0x30000, 0x3134F),  # CJK Unified Ideographs Extension G
        (0x31350, 0x323AF),  # CJK Unified Ideographs Extension H
        (0x3300, 0x33FF),  # CJK Compatibility
        (0xFE30, 0xFE4F),  # CJK Compatibility Forms
        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
        # Bopomofo letters run through U+312F; stopping at U+3129 left
        # U+312A-U+312F classified as non-CJK.
        (0x3105, 0x312F),  # Bopomofo
        (0x31A0, 0x31BF),  # Bopomofo Extended
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x31F0, 0x31FF),  # Katakana Phonetic Extensions
        (0xAC00, 0xD7AF),  # Hangul Syllables
        (0x1100, 0x11FF),  # Hangul Jamo
        (0xA960, 0xA97F),  # Hangul Jamo Extended-A
        (0xD7B0, 0xD7FF),  # Hangul Jamo Extended-B
    )
    return any(start <= code_point <= end for start, end in ranges)
@@ -1,7 +1,8 @@
1
1
  import re
2
2
  import regex
3
3
 
4
- from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html
4
+ from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
5
+ remove_spaces_between_chinese_characters
5
6
 
6
7
 
7
8
  class StringClear(object):
@@ -101,6 +102,7 @@ class StringClear(object):
101
102
  return self
102
103
 
103
104
  def remove_html_tag(self):
105
+ # 去除 html 标签
104
106
  import html
105
107
 
106
108
  self.obj_str = html.unescape(self.obj_str)
@@ -109,6 +111,11 @@ class StringClear(object):
109
111
 
110
112
  return self
111
113
 
114
def remove_spaces_in_chinese_characters(self):
    """Remove the whitespace found between Chinese characters in ``self.obj_str``.

    The work is delegated to ``remove_spaces_between_chinese_characters``;
    ``self`` is returned so the call can be chained.
    """
    cleaned = remove_spaces_between_chinese_characters(self.obj_str)
    self.obj_str = cleaned
    return self
118
+
112
119
  def get_str(self):
113
120
  return self.obj_str
114
121
 
@@ -122,6 +129,7 @@ def rel_clear(str_obj):
122
129
  .remove_html_tag() # html标签清理
123
130
  .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
124
131
  .collapse_spaces() # 移除多余空格,连续多个空格变一个
132
+ .remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
125
133
  .lower() # 小写
126
134
  .get_str() # 获取str
127
135
  .strip()) # 去掉空格
@@ -143,4 +143,12 @@ def clean_html(html):
143
143
  return parser.get_text()
144
144
 
145
145
 
146
def remove_spaces_between_chinese_characters(text):
    """
    Delete whitespace runs that have a Chinese character on both sides.

    Only the contiguous range U+3400-U+9FFF is matched. The later extension
    blocks are deliberately left out: they are scattered, listing them all
    would cost performance, and this range is considered sufficient.

    :param text: the string to clean
    :return: ``text`` with whitespace between two in-range characters removed
    """
    between_chinese = re.compile(r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])')
    return between_chinese.sub('', text)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.9
3
+ Version: 10.0.11
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -171,7 +171,7 @@ re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
171
171
  re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
172
172
  re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
173
173
  re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
174
- re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=GfZf_zFgEMm6DO0w7d70Fzv1iKzq0WqBTMEfzjEuBAw,1292
174
+ re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
175
175
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
176
176
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
177
177
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
@@ -181,9 +181,9 @@ re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJ
181
181
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
182
182
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
183
183
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
184
- re_common/v2/baselibrary/utils/string_bool.py,sha256=tR9JrZuuBxz7oDgSpndKAeer0BYYFrhSxikUsNkHUeg,1099
185
- re_common/v2/baselibrary/utils/string_clear.py,sha256=LDIf-3Czq1sXp-54aifXdXbdGUX7hpFBKqQa5Azj_lo,5861
186
- re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
184
+ re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
185
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=pGxL9PlzQDM06sC0j6U0zYRemvsJ7-OOpfzS5ETCxAs,6258
186
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=watvMwx8gzEj0Swz7e1cFUUQE1UkN81Fw-Hkjs4l8lo,4233
187
187
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
188
188
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
189
189
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -210,8 +210,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
210
210
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
211
211
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
212
212
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
213
- re_common-10.0.9.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
- re_common-10.0.9.dist-info/METADATA,sha256=7EH3E_6nA_nQ7s190Qrc4ylAMGaywO-s_z79css2utM,581
215
- re_common-10.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
- re_common-10.0.9.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
- re_common-10.0.9.dist-info/RECORD,,
213
+ re_common-10.0.11.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
+ re_common-10.0.11.dist-info/METADATA,sha256=5g6SC3mrd2cryFaMmajqme2KGUoyoEkoDDwtqGeCYso,582
215
+ re_common-10.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
+ re_common-10.0.11.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
+ re_common-10.0.11.dist-info/RECORD,,