re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +77 -77
- re_common/v2/baselibrary/utils/db.py +156 -156
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +186 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +271 -278
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
- re_common-10.0.39.dist-info/RECORD +248 -0
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
- re_common-10.0.37.dist-info/RECORD +0 -248
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
|
@@ -1,278 +1,271 @@
|
|
|
1
|
-
import re
|
|
2
|
-
import threading
|
|
3
|
-
from html.parser import HTMLParser
|
|
4
|
-
from itertools import combinations
|
|
5
|
-
|
|
6
|
-
import regex
|
|
7
|
-
import unicodedata
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
"""
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
r"
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
#
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
#
|
|
176
|
-
#
|
|
177
|
-
#
|
|
178
|
-
#
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
return
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
返回:
|
|
274
|
-
str: 只包含 Unicode 字母和数字的文本。
|
|
275
|
-
\p{N} 匹配所有 Unicode 数字字符 包括非阿拉伯数字字符
|
|
276
|
-
\p{L} 匹配所有语言字符
|
|
277
|
-
"""
|
|
278
|
-
return regex.sub(r"[^\p{L}\p{N}]+", "", text)
|
|
1
|
+
import re
|
|
2
|
+
import threading
|
|
3
|
+
from html.parser import HTMLParser
|
|
4
|
+
from itertools import combinations
|
|
5
|
+
|
|
6
|
+
import regex
|
|
7
|
+
import unicodedata
|
|
8
|
+
|
|
9
|
+
from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def bj2qj(src):
    """Convert half-width (ASCII) characters in ``src`` to full-width forms.

    The ASCII space becomes the ideographic space U+3000, and every printable
    ASCII character in the range 0x21-0x7E is shifted by 0xFEE0 into the
    full-width block U+FF01-U+FF5E. All other characters are copied unchanged.

    Args:
        src: The string to convert, or ``None``.

    Returns:
        The converted string, or ``None`` when ``src`` is ``None``.
    """
    if src is None:
        return src

    # Both spaces are written as escapes so the two constants can never
    # collapse into the same character when the file is re-encoded or copied.
    half_width_space = "\u0020"
    full_width_space = "\u3000"
    first_convertible = 0x21  # '!'
    last_convertible = 0x7E  # '~'
    full_width_offset = 0xFEE0  # 65248: distance from '!' to U+FF01

    converted = []
    for char in src:
        if char == half_width_space:
            converted.append(full_width_space)
        elif first_convertible <= ord(char) <= last_convertible:
            converted.append(chr(ord(char) + full_width_offset))
        else:
            converted.append(char)

    return "".join(converted)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def qj2bj(text):
    """Convert full-width characters in ``text`` to their half-width forms.

    The ideographic space U+3000 becomes an ASCII space and each character in
    U+FF01-U+FF5E is shifted down by 0xFEE0. The translation table is built
    on the first call and stored on the function object as
    ``qj2bj.trans_table`` so later calls reuse it.

    Args:
        text: The string to convert, or ``None``.

    Returns:
        The converted string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return text

    try:
        table = qj2bj.trans_table
    except AttributeError:
        # First call: build the character-to-character mapping once.
        mapping = {
            chr(code_point): chr(code_point - 0xFEE0)
            for code_point in range(0xFF01, 0xFF5F)
        }
        mapping[chr(0x3000)] = chr(0x0020)  # full-width space
        table = str.maketrans(mapping)
        qj2bj.trans_table = table

    return text.translate(table)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
"""
|
|
54
|
+
总结对比表
|
|
55
|
+
规范名 处理步骤 组合方式 兼容性归一化 主要用途
|
|
56
|
+
NFC 规范分解 → 规范组合 组合 否 保留预组合字符,文本呈现和存储
|
|
57
|
+
NFD 规范分解 不组合 否 拆解字符,便于逐字符处理
|
|
58
|
+
NFKC 兼容性分解 → 规范组合 组合 是 消除兼容差异,文本比较和索引
|
|
59
|
+
NFKD 兼容性分解 → 规范分解 不组合 是 最大程度拆解,文本分析和预处理
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_diacritic_variant(char1):
    """Return ``char1`` with nonspacing combining marks stripped.

    The input is first put into NFD (canonical decomposition), which splits a
    precomposed character into its base character plus combining marks, e.g.
    U+00E9 becomes ``e`` followed by U+0301. NFD does not touch compatibility
    characters such as ligatures, so U+FB02 is returned unchanged.

    Args:
        char1: The text to process (a single character or a longer string).

    Returns:
        The decomposed text with every character of Unicode category ``Mn``
        removed.
    """
    decomposed = unicodedata.normalize('NFD', char1)
    kept = [piece for piece in decomposed if unicodedata.category(piece) != 'Mn']
    return ''.join(kept)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def normalize_nfkc(strs: str) -> str:
    """Strip surrounding whitespace and apply NFKC normalization.

    NFKC (compatibility decomposition followed by canonical composition)
    replaces compatibility characters with their standard forms and then
    recombines base characters with their marks. For example the ligature
    U+FB02 becomes ``fl``, the circled digit U+2460 becomes ``1`` and the
    full-width letter U+FF21 becomes ``A``.

    Args:
        strs: The text to normalize.

    Returns:
        The stripped, NFKC-normalized text.
    """
    trimmed = strs.strip()
    return unicodedata.normalize('NFKC', trimmed)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_alphabetic_ratio(text: str) -> float:
    """Return the share of Latin-script letters among all letters in ``text``.

    Digits and every other non-letter character are discarded first, so the
    ratio is computed over Unicode letters only (general category L).

    Args:
        text: The text to inspect.

    Returns:
        ``0`` when ``text`` is empty or ``None``; ``1.0`` when ``text``
        contains no letters at all; otherwise the number of characters that
        fall in the Latin ranges below divided by the number of letters.
    """
    if not text:
        return 0

    text = re.sub(r'\d+', '', text)

    # Character class covering the Latin-script blocks. Every range must use
    # an ASCII hyphen: an en dash inside the brackets is just a literal
    # character, which silently turns the range into three single characters.
    alphabetic_pattern = (
        r"[\u0041-\u005A\u0061-\u007A"  # Basic Latin letters (A-Z, a-z)
        r"\u00C0-\u00FF"  # Latin letters with diacritics
        r"\u0080-\u00FF"  # Latin-1 Supplement
        r"\u0100-\u017F"  # Latin Extended-A
        r"\u1E00-\u1EFF"  # Latin Extended Additional
        r"\u0180-\u024F"  # Latin Extended-B
        r"\u2C60-\u2C7F"  # Latin Extended-C
        r"\uA720-\uA7FF"  # Latin Extended-D
        r"\uAB30-\uAB6F"  # Latin Extended-E
        r"]"
    )

    # Keep letters only; this also drops the non-letter code points that the
    # Latin-1 Supplement range would otherwise admit.
    clean_text = regex.sub(r"[^\p{L}]", "", text)

    if not clean_text:
        return 1.0

    alphabetic_chars = re.findall(alphabetic_pattern, clean_text)

    return len(alphabetic_chars) / len(clean_text)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects text while ignoring script and style bodies.

    Text chunks are accumulated in ``self.text``; ``self.skip`` is a plain
    boolean that is switched on by an opening ``script``/``style`` tag and
    switched off by the matching closing tag.
    """

    # Holds one cached parser per thread; see get_parser().
    _thread_local = threading.local()

    def __init__(self):
        super().__init__()
        self.reset_state()

    def reset_state(self):
        """Reset the underlying parser and clear the collected text."""
        self.reset()
        self.text = []
        self.skip = False

    def handle_starttag(self, tag, attrs):
        """Start skipping data when a script or style element opens."""
        if tag == 'script' or tag == 'style':
            self.skip = True

    def handle_endtag(self, tag):
        """Stop skipping data when a script or style element closes."""
        if tag == 'script' or tag == 'style':
            self.skip = False

    def handle_data(self, data):
        """Store ``data`` unless it is being skipped or is only whitespace."""
        if self.skip:
            return
        if not data.strip():
            return
        self.text.append(data)

    def get_text(self):
        """Return all collected chunks joined together and stripped."""
        joined = ''.join(self.text)
        return joined.strip()

    @classmethod
    def get_parser(cls):
        """Return the calling thread's parser, creating it on first use."""
        try:
            return cls._thread_local.parser
        except AttributeError:
            pass
        cls._thread_local.parser = cls()
        return cls._thread_local.parser
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# def clean_html(html):
|
|
167
|
+
# parser = HTMLTextExtractor.get_parser()
|
|
168
|
+
# parser.reset_state()
|
|
169
|
+
# parser.feed(html)
|
|
170
|
+
# parser.close()
|
|
171
|
+
# return parser.get_text()
|
|
172
|
+
|
|
173
|
+
# def clean_html(html):
|
|
174
|
+
# """使用 Parsel 提取 HTML 中的纯文本"""
|
|
175
|
+
# sel = Selector(text=html, type='html')
|
|
176
|
+
# # 提取所有文本(包括子元素的文本)
|
|
177
|
+
# text = sel.xpath("string()").getall()
|
|
178
|
+
# return "".join(text).strip()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def clean_html(html):
    """Return the plain text of ``html``, parsing it only when it has markup.

    Input that is empty, ``None`` or contains no ``<`` character is returned
    unchanged without importing BeautifulSoup. Otherwise the text is parsed
    with the ``lxml`` parser, falling back to ``html5lib`` if that attempt
    fails for any ordinary reason.

    Args:
        html: The HTML fragment or plain text to clean.

    Returns:
        The extracted text, or ``html`` itself when no parsing was needed.

    Raises:
        Exception: Whatever the ``html5lib`` fallback raises when it fails
            as well.
    """
    if not html or "<" not in html:
        return html

    # Imported lazily so callers that only pass plain text do not need bs4.
    from bs4 import BeautifulSoup

    try:
        return BeautifulSoup(html, "lxml").get_text()
    except Exception:
        # Best-effort fallback. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt and SystemExit propagate.
        return BeautifulSoup(html, "html5lib").get_text()
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def remove_spaces_between_chinese_characters(text):
    """Delete whitespace that sits directly between two Chinese characters.

    Only code points in U+3400-U+9FFF are treated as Chinese here. The later
    extension blocks are deliberately left out: they are scattered, listing
    them all would cost performance, and this range is considered sufficient.

    Args:
        text: The text to process.

    Returns:
        ``text`` with each whitespace run removed when both of its neighbours
        are in the range above.
    """
    between_cjk = re.compile(r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])')
    return between_cjk.sub('', text)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
sim_utils = JaroDamerauLevenshteinMaxSim()
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def group_similar_texts(texts, threshold=0.9):
    """Group texts into connected components of a similarity graph.

    Every pair of texts is compared with ``sim_utils.get_sim`` after both
    have been passed through ``rel_clear``. Pairs scoring at least
    ``threshold`` are linked, and each connected component of the resulting
    graph becomes one group.

    Args:
        texts: Sequence of texts to group.
        threshold: Minimum similarity for two texts to be linked.

    Returns:
        A list of groups, each a list of indices into ``texts``. Groups are
        ordered by their smallest index and members appear in depth-first
        visiting order.
    """
    from re_common.v2.baselibrary.utils.string_clear import rel_clear

    n = len(texts)

    # Clean each text once instead of once per pair. Assumes rel_clear is
    # deterministic and side-effect free -- TODO confirm. Skipped for fewer
    # than two texts, where no comparison (and so no cleaning) ever happens.
    cleaned = [rel_clear(item) for item in texts] if n > 1 else []

    # Adjacency list of the similarity graph.
    graph = [[] for _ in range(n)]
    for i, j in combinations(range(n), 2):
        if sim_utils.get_sim(cleaned[i], cleaned[j]) >= threshold:
            graph[i].append(j)
            graph[j].append(i)

    visited = [False] * n
    groups = []

    for start in range(n):
        if visited[start]:
            continue
        current_group = []
        # Explicit stack instead of recursion, so a very large component
        # cannot exceed the interpreter's recursion limit. Neighbours are
        # pushed in reverse and marked on pop, which yields the same visiting
        # order as a recursive depth-first search.
        stack = [start]
        while stack:
            node = stack.pop()
            if visited[node]:
                continue
            visited[node] = True
            current_group.append(node)
            stack.extend(
                neighbor for neighbor in reversed(graph[node])
                if not visited[neighbor]
            )
        groups.append(current_group)

    return groups
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def get_group_abstract(lists):
    """Group ids whose texts are similar, using a simple connected graph.

    Args:
        lists: Sequence of ``(id, text)`` pairs.

    Returns:
        A two-dimensional list: each inner list holds the ids that belong to
        one group, as produced by ``group_similar_texts`` with a threshold of
        0.9.
    """
    keyid_list = [pair[0] for pair in lists]
    abstract_list = [pair[1] for pair in lists]
    groups = group_similar_texts(abstract_list, threshold=0.9)
    # Map every index in every group back to the id at that position.
    return [[keyid_list[text_idx] for text_idx in group] for group in groups]
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def clean_unicode_alnum(text: str) -> str:
    r"""Remove every character that is not a Unicode letter or number.

    The pattern uses two Unicode property classes:
    ``\p{L}`` matches letters of any language, and ``\p{N}`` matches all
    Unicode numeric characters, including non-Arabic digits.

    Args:
        text: The input text.

    Returns:
        The text containing only Unicode letters and numbers.
    """
    # The docstring is a raw string because "\p" is not a valid escape
    # sequence in an ordinary string literal.
    return regex.sub(r"[^\p{L}\p{N}]+", "", text)
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class BaseStepProcess(ABC):
    """Abstract base class for a processing step.

    Subclasses must implement ``do_task``; the base class cannot be
    instantiated directly.
    """

    def __init__(self):
        # Starts out empty; what gets stored here is up to the subclass.
        self.stat_dicts = dict()

    @abstractmethod
    def do_task(self, *args, **kwargs):
        """Run the step. Must be overridden; the base body returns ``None``."""
        return None
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseStepProcess(ABC):
    """Abstract base class for a processing step.

    Subclasses must implement ``do_task``; the base class cannot be
    instantiated directly.
    """

    def __init__(self):
        # Starts out empty; what gets stored here is up to the subclass.
        self.stat_dicts = dict()

    @abstractmethod
    def do_task(self, *args, **kwargs):
        """Run the step. Must be overridden; the base body returns ``None``."""
        return None
|