re-common 10.0.22__py3-none-any.whl → 10.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +195 -0
- re_common/v2/baselibrary/business_utils/__init__.py +0 -0
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -0
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -79
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/dict_tools.py +37 -37
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/list_tools.py +65 -65
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/author_smi.py +360 -360
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +161 -161
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +77 -77
- re_common/v2/baselibrary/utils/db.py +38 -38
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +186 -149
- re_common/v2/baselibrary/utils/string_clear.py +227 -204
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +213 -213
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/LICENSE +201 -201
- {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/METADATA +16 -16
- re_common-10.0.24.dist-info/RECORD +230 -0
- {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/WHEEL +1 -1
- re_common-10.0.22.dist-info/RECORD +0 -227
- {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/top_level.txt +0 -0
|
@@ -1,204 +1,227 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
return self
|
|
33
|
-
|
|
34
|
-
def
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
self.obj_str = self.obj_str
|
|
46
|
-
return self
|
|
47
|
-
|
|
48
|
-
def
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
self.obj_str =
|
|
55
|
-
return self
|
|
56
|
-
|
|
57
|
-
def
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
"""
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
""
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
self.obj_str =
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
strs =
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
1
|
+
import re
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from urllib.parse import unquote
|
|
4
|
+
|
|
5
|
+
import regex
|
|
6
|
+
|
|
7
|
+
from re_common.v2.baselibrary.utils.stringutils import (
|
|
8
|
+
qj2bj,
|
|
9
|
+
bj2qj,
|
|
10
|
+
get_diacritic_variant,
|
|
11
|
+
clean_html,
|
|
12
|
+
remove_spaces_between_chinese_characters,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@lru_cache(maxsize=1)
def get_cc():
    """Return the shared OpenCC converter, building it on the first call.

    The converter is created with the ``"t2s"`` configuration, i.e.
    Traditional Chinese to Simplified Chinese. ``lru_cache`` keeps the single
    instance so later calls reuse it. ``opencc`` is imported inside the
    function, so it is only needed once a conversion is actually requested
    (install with ``pip install opencc-python-reimplemented``).
    """
    from opencc import OpenCC

    return OpenCC("t2s")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class StringClear(object):
    """Chainable string cleaner.

    The wrapped value is stored in ``obj_str``. Every cleaning method rewrites
    ``obj_str`` in place and returns ``self`` so steps can be chained; call
    :meth:`get_str` at the end to read the result.
    """

    def __init__(self, obj_str):
        self.obj_str = obj_str

    def None_to_str(self):
        """Turn a ``None`` value into ``""``; leave any other value untouched."""
        if self.obj_str is None:
            self.obj_str = ""
        return self

    def to_str(self):
        """Coerce the value with ``str()`` (e.g. when an int or float was given)."""
        current = self.obj_str
        self.obj_str = str(current)
        return self

    def qj_to_bj(self):
        """Convert full-width characters to half-width via ``qj2bj``."""
        current = self.obj_str
        self.obj_str = qj2bj(current)
        return self

    def bj_to_qj(self):
        """Convert half-width characters to full-width via ``bj2qj``."""
        current = self.obj_str
        self.obj_str = bj2qj(current)
        return self

    def convert_to_simplified(self):
        """Convert Traditional Chinese to Simplified Chinese with ``get_cc()``."""
        converter = get_cc()
        self.obj_str = converter.convert(self.obj_str)
        return self

    def lower(self):
        """Lower-case the value."""
        current = self.obj_str
        self.obj_str = current.lower()
        return self

    def upper(self):
        """Upper-case the value."""
        current = self.obj_str
        self.obj_str = current.upper()
        return self

    def collapse_spaces(self):
        """Replace every run of whitespace with one space character."""
        current = self.obj_str
        self.obj_str = re.sub(r"\s+", " ", current)
        return self

    def clear_all_spaces(self):
        """Remove all whitespace."""
        current = self.obj_str
        self.obj_str = re.sub("\\s+", "", current)
        return self

    def clean_symbols(self):
        """
        Strip a known set of punctuation marks and symbols.

        Legacy pattern: "[\\p{P}~`=¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“\\-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]"
        """
        symbol_class = (
            r"[\p{P}"  # every Unicode punctuation character
            r"~`=¥×\\*#$^|+%&~<> "  # extra special symbols listed explicitly
            r"”“\-≤—―–‐‘’“”″…¨°■℃"  # further punctuation marks and symbols
            r"ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ"  # Roman numerals
            r"]"
        )

        # \p{P} is the punctuation class and the rest are additional symbols.
        # \p{S} could stand in for them, but it is very broad and might strip
        # characters that should be kept.
        current = self.obj_str
        self.obj_str = regex.sub(symbol_class, "", current)
        return self

    def remove_special_chars(self):
        """Drop everything except word characters and whitespace.

        ``\\w`` already covers all Unicode letters (Chinese characters
        included) and digits; the underscore ``_`` is also kept.
        """
        current = self.obj_str
        self.obj_str = re.sub(r"[^\w\s]", "", current)
        return self

    def remove_underline(self):
        """Remove underscores; separate step because ``\\w`` includes ``_``."""
        current = self.obj_str
        self.obj_str = re.sub("[_]", "", current)
        return self

    def replace_dash_with_space(self):
        """Replace each ``-`` with a space."""
        current = self.obj_str
        self.obj_str = current.replace("-", " ")
        return self

    def strip_quotes(self):
        """Remove every double-quote character."""
        current = self.obj_str
        self.obj_str = current.replace('"', "")
        return self

    def remove_diacritics(self):
        """Replace accented letters with plain ones via ``get_diacritic_variant``."""
        current = self.obj_str
        self.obj_str = get_diacritic_variant(current)
        return self

    def remove_brackets(self):
        """Remove square brackets together with the text inside them."""
        current = self.obj_str
        self.obj_str = re.sub("\\[.*?]", "", current)
        return self

    def remove_parentheses(self):
        """Remove round brackets together with the text inside them."""
        current = self.obj_str
        self.obj_str = re.sub("\\(.*?\\)", "", current)
        return self

    def remove_html_tag(self):
        """Unescape HTML entities, then strip HTML tags with ``clean_html``."""
        import html

        # Entities are decoded first so the value handed to clean_html
        # contains literal characters instead of escapes.
        self.obj_str = html.unescape(self.obj_str)
        self.obj_str = clean_html(self.obj_str)
        return self

    def remove_spaces_in_chinese_characters(self):
        """Remove the spaces that sit between Chinese characters."""
        current = self.obj_str
        self.obj_str = remove_spaces_between_chinese_characters(current)
        return self

    def url_to_str(self):
        """
        Decode URL percent-encoding back into characters.
        """
        current = self.obj_str
        self.obj_str = unquote(current)
        return self

    def get_str(self):
        """Return the current value of ``obj_str``."""
        return self.obj_str
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def rel_clear(str_obj):
    """Apply the cleaning rules tailored for merged (fusion) data.

    :param str_obj: value to clean; ``None`` and non-string values are accepted.
    :return: the cleaned, lower-cased string with surrounding whitespace removed.
    """
    cleaner = StringClear(str_obj)
    cleaner.None_to_str()  # guard against None
    cleaner.to_str()  # guard against other types such as int / float
    cleaner.qj_to_bj()  # full-width -> half-width
    cleaner.remove_html_tag()  # strip HTML tags
    cleaner.remove_special_chars()  # keep only \w characters and whitespace ("_" stays)
    cleaner.collapse_spaces()  # runs of whitespace -> one space
    cleaner.remove_spaces_in_chinese_characters()  # drop spaces between Chinese characters
    cleaner.convert_to_simplified()  # Traditional -> Simplified Chinese
    cleaner.lower()  # lower-case
    cleaned = cleaner.get_str()
    return cleaned.strip()  # trim surrounding whitespace
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def clear_au_organ(str_obj):
    """
    Cleaning tailored for author / organisation strings.

    Unlike ``rel_clear`` the text is not lower-cased, and a few extra
    special-case clean-ups are applied.
    """
    cleaner = StringClear(str_obj)
    cleaner.None_to_str()  # None -> empty string
    cleaner.to_str()  # guard against other types such as int / float
    cleaner.qj_to_bj()  # full-width -> half-width
    cleaner.strip_quotes()  # drop double quotes
    cleaner.clean_symbols()  # strip the known punctuation / symbols
    cleaner.collapse_spaces()  # runs of whitespace -> one space
    cleaner.convert_to_simplified()  # Traditional -> Simplified Chinese
    cleaned = cleaner.get_str().strip()  # trim surrounding whitespace

    # Special-case requirement: drop the literal marker "lt正gt".
    # NOTE(review): presumably what is left of an escaped "<正>" once the
    # symbols above have been stripped -- confirm with the data owner.
    without_marker = cleaned.replace("lt正gt", "")
    return without_marker.strip()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def ref_clear(str_obj):
    """Clean a citation (reference) string.

    Steps, in order: ``None`` -> ``""``; coerce to ``str``; unescape HTML
    entities and strip tags; full-width -> half-width; drop double quotes;
    strip punctuation / symbols; collapse whitespace runs; lower-case;
    replace diacritic letters; trim surrounding whitespace.

    ``to_str()`` deliberately runs before ``remove_html_tag()``: the HTML step
    calls ``html.unescape``, which raises ``TypeError`` for non-string input
    such as an int or float, so the coercion has to happen first. For ``str``
    input the order makes no difference to the result.

    :param str_obj: value to clean; may be ``None`` or a non-string value.
    :return: the cleaned string.
    """
    strs = (
        StringClear(str_obj)
        .None_to_str()  # None -> empty string
        .to_str()  # coerce int / float / ... before any string-only step
        .remove_html_tag()  # strip HTML tags
        .qj_to_bj()  # full-width -> half-width
        .strip_quotes()  # drop double quotes
        .clean_symbols()  # strip the known punctuation / symbols
        .collapse_spaces()  # runs of whitespace -> one space
        .lower()  # lower-case
        .remove_diacritics()  # accented letters -> plain letters
        .get_str()  # unwrap the cleaned value
        .strip()  # trim surrounding whitespace
    )
    return strs
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def clear_obj(str_obj):
    """Cleaning tailored for object normalisation.

    Runs ``clear_au_organ`` first, then replaces diacritic letters,
    upper-cases the text and trims surrounding whitespace.
    """
    base = clear_au_organ(str_obj)
    # NOTE: an explicit "ß" -> "SS" replacement ("SS" is the upper-case form
    # of "ß") was drafted for this spot and is left disabled.
    cleaner = StringClear(base)
    cleaner.remove_diacritics()  # accented letters -> plain letters
    cleaner.upper()
    cleaned = cleaner.get_str()
    return cleaned.strip()  # trim surrounding whitespace
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def normalize_title_for_es(title: str):
    """Normalise a title before it is used with Elasticsearch.

    The title is converted to Simplified Chinese and full-width characters
    become half-width. When the result contains no character in the range
    U+4E00..U+9FA5, every dash-like character is replaced by a space. The
    value is returned with surrounding whitespace removed.
    """
    normalized = StringClear(title).convert_to_simplified().qj_to_bj().get_str()
    if re.search(r"[\u4e00-\u9fa5]", normalized) is None:
        # No Chinese characters: treat the various dash glyphs as word breaks.
        normalized = re.sub(r"[-—‑–−―-]", " ", normalized)
    return normalized.strip()
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import jellyfish
|
|
2
|
-
from rapidfuzz.distance import DamerauLevenshtein
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class JaroDamerauLevenshteinMaxSim(object):
    """
    Similarity scorer that returns the larger of two measures.

    ``jaro_similarity`` alone has a weakness: sample data such as the pair
    below gets a very low score.
    s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
    s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
    """

    def get_sim(self, str1: str, str2: str) -> float:
        """Return ``max(Jaro similarity, 1 - normalised Damerau-Levenshtein distance)``.

        When both inputs are blank after stripping, the Damerau-Levenshtein
        based score is fixed at 0 instead of being computed.
        """
        jaro_score = jellyfish.jaro_similarity(str1, str2)
        both_blank = not str1.strip() and not str2.strip()
        if both_blank:
            edit_score = 0
        else:
            edit_score = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
        return max(jaro_score, edit_score)
|
|
1
|
+
import jellyfish
|
|
2
|
+
from rapidfuzz.distance import DamerauLevenshtein
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JaroDamerauLevenshteinMaxSim(object):
    """
    Similarity scorer that returns the larger of two measures.

    ``jaro_similarity`` alone has a weakness: sample data such as the pair
    below gets a very low score.
    s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
    s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
    """

    def get_sim(self, str1: str, str2: str) -> float:
        """Return ``max(Jaro similarity, 1 - normalised Damerau-Levenshtein distance)``.

        When both inputs are blank after stripping, the Damerau-Levenshtein
        based score is fixed at 0 instead of being computed.
        """
        jaro_score = jellyfish.jaro_similarity(str1, str2)
        both_blank = not str1.strip() and not str2.strip()
        if both_blank:
            edit_score = 0
        else:
            edit_score = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
        return max(jaro_score, edit_score)
|