re-common 10.0.10__py3-none-any.whl → 10.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/utils/string_clear.py +9 -1
- re_common/v2/baselibrary/utils/stringutils.py +8 -0
- {re_common-10.0.10.dist-info → re_common-10.0.11.dist-info}/METADATA +1 -1
- {re_common-10.0.10.dist-info → re_common-10.0.11.dist-info}/RECORD +7 -7
- {re_common-10.0.10.dist-info → re_common-10.0.11.dist-info}/LICENSE +0 -0
- {re_common-10.0.10.dist-info → re_common-10.0.11.dist-info}/WHEEL +0 -0
- {re_common-10.0.10.dist-info → re_common-10.0.11.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import regex
|
|
3
3
|
|
|
4
|
-
from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html
|
|
4
|
+
from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
|
|
5
|
+
remove_spaces_between_chinese_characters
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class StringClear(object):
|
|
@@ -101,6 +102,7 @@ class StringClear(object):
|
|
|
101
102
|
return self
|
|
102
103
|
|
|
103
104
|
def remove_html_tag(self):
|
|
105
|
+
# 去除 html 标签
|
|
104
106
|
import html
|
|
105
107
|
|
|
106
108
|
self.obj_str = html.unescape(self.obj_str)
|
|
@@ -109,6 +111,11 @@ class StringClear(object):
|
|
|
109
111
|
|
|
110
112
|
return self
|
|
111
113
|
|
|
114
|
+
def remove_spaces_in_chinese_characters(self):
|
|
115
|
+
# 匹配中文间的空格并替换为空字符串
|
|
116
|
+
self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
|
|
117
|
+
return self
|
|
118
|
+
|
|
112
119
|
def get_str(self):
|
|
113
120
|
return self.obj_str
|
|
114
121
|
|
|
@@ -122,6 +129,7 @@ def rel_clear(str_obj):
|
|
|
122
129
|
.remove_html_tag() # html标签清理
|
|
123
130
|
.remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
|
|
124
131
|
.collapse_spaces() # 移除多余空格,连续多个空格变一个
|
|
132
|
+
.remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
|
|
125
133
|
.lower() # 小写
|
|
126
134
|
.get_str() # 获取str
|
|
127
135
|
.strip()) # 去掉空格
|
|
@@ -143,4 +143,12 @@ def clean_html(html):
|
|
|
143
143
|
return parser.get_text()
|
|
144
144
|
|
|
145
145
|
|
|
146
|
+
def remove_spaces_between_chinese_characters(text):
|
|
147
|
+
"""
|
|
148
|
+
匹配中文间的空格并替换为空字符串
|
|
146
149
|
|
|
150
|
+
这里没有选取 后面的一些扩展分区 是那些分区比较分散 都写进来消耗性能,
|
|
151
|
+
认为只包含这些也够用了
|
|
152
|
+
"""
|
|
153
|
+
pattern = r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])'
|
|
154
|
+
return re.sub(pattern, '', text)
|
|
@@ -182,8 +182,8 @@ re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5w
|
|
|
182
182
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
|
|
183
183
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
184
184
|
re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
|
|
185
|
-
re_common/v2/baselibrary/utils/string_clear.py,sha256=
|
|
186
|
-
re_common/v2/baselibrary/utils/stringutils.py,sha256=
|
|
185
|
+
re_common/v2/baselibrary/utils/string_clear.py,sha256=pGxL9PlzQDM06sC0j6U0zYRemvsJ7-OOpfzS5ETCxAs,6258
|
|
186
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=watvMwx8gzEj0Swz7e1cFUUQE1UkN81Fw-Hkjs4l8lo,4233
|
|
187
187
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
188
188
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
189
189
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -210,8 +210,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
210
210
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
211
211
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
212
212
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
213
|
-
re_common-10.0.10.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
214
|
-
re_common-10.0.
|
|
215
|
-
re_common-10.0.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
216
|
-
re_common-10.0.10.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
217
|
-
re_common-10.0.10.dist-info/RECORD,,
|
|
213
|
+
re_common-10.0.11.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
214
|
+
re_common-10.0.11.dist-info/METADATA,sha256=5g6SC3mrd2cryFaMmajqme2KGUoyoEkoDDwtqGeCYso,582
|
|
215
|
+
re_common-10.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
216
|
+
re_common-10.0.11.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
217
|
+
re_common-10.0.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|