re-common 10.0.39__py3-none-any.whl → 10.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
- re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
- re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +94 -77
- re_common/v2/baselibrary/utils/db.py +174 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +187 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +312 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
- re_common-10.0.41.dist-info/RECORD +252 -0
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from re_common.v2.baselibrary.tools.tree_processor.node import TreeNode
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def build_forest(node_list):
    """
    Build a forest of TreeNode objects from flat ``(cid, pid, count)`` records.

    Args:
        node_list: Re-iterable collection (it is traversed twice) of
            ``(cid, pid, count)`` tuples, where ``cid`` is the node id,
            ``pid`` the parent id (``None`` for "no parent") and ``count``
            the value stored on the node.

    Returns:
        list: The root nodes, i.e. every node that ended up without a parent.
        This includes nodes whose ``pid`` is ``None``, nodes whose ``pid`` does
        not appear as a ``cid`` in ``node_list``, and nodes that name
        themselves as their own parent.
    """
    # Pass 1: create one node per id. When a cid is repeated the last record
    # wins, while the position of the first occurrence is kept.
    nodes = {cid: TreeNode(cid, count) for cid, _pid, count in node_list}

    # Pass 2: wire every child to its parent.
    for cid, pid, _count in node_list:
        # A record that is its own parent would form a one-node cycle that is
        # reachable from no root; keep such a node as a root instead.
        if pid is None or pid == cid:
            continue
        parent = nodes.get(pid)
        if parent is None:
            # Unknown parent id: the node stays unattached and becomes a root.
            continue
        parent.add_child(nodes[cid])

    # Pass 3: whatever has no parent is the root of one tree in the forest.
    return [node for node in nodes.values() if node.parent is None]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
class TreeNode:
    """A node of an n-ary tree carrying an id, a count and parent/child links."""

    def __init__(self, cid, count):
        """
        Create an unattached node.

        Args:
            cid: Identifier stored in ``self.id``.
            count: Value stored in ``self.count``.
        """
        self.id = cid
        self.count = count
        self.children = []  # child nodes, in the order they were attached
        self.parent = None  # parent node, or None while the node is a root

    def add_child(self, child):
        """
        Attach ``child`` under this node.

        If ``child`` is already attached somewhere (including to this node),
        it is first removed from that parent's ``children`` so that it is
        never listed under two parents or listed twice.

        Args:
            child: The TreeNode to attach.
        """
        previous = child.parent
        if previous is not None:
            # Identity-based removal; rewritten in place so external
            # references to the list object stay valid.
            previous.children[:] = [
                node for node in previous.children if node is not child
            ]
        self.children.append(child)
        child.parent = self

    def is_leaf(self):
        """Return True when the node has no children."""
        return not self.children

    def __repr__(self):
        """Return a short debugging representation."""
        return (
            f"{type(self).__name__}(id={self.id!r}, count={self.count!r}, "
            f"children={len(self.children)})"
        )
|
|
@@ -1,60 +1,60 @@
|
|
|
1
|
-
"""
|
|
2
|
-
并查集(Union-Find)是一种用于管理元素分组的数据结构,主要用于解决动态连通性问题。它支持以下两种核心操作:
|
|
3
|
-
|
|
4
|
-
查找(Find):确定某个元素属于哪个集合。
|
|
5
|
-
|
|
6
|
-
合并(Union):将两个集合合并为一个集合。
|
|
7
|
-
|
|
8
|
-
并查集广泛应用于图论、网络连接、社交网络分析、图像处理等领域。
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class UnionFind:
|
|
13
|
-
def __init__(self):
|
|
14
|
-
"""
|
|
15
|
-
初始化并查集。
|
|
16
|
-
使用字典动态存储 parent 和 rank。
|
|
17
|
-
"""
|
|
18
|
-
self.parent = {} # 存储每个元素的父节点,用于表示集合的树结构
|
|
19
|
-
self.rank = {} # 存储每个集合的秩(树的高度),用于优化合并操作
|
|
20
|
-
|
|
21
|
-
def find(self, x):
|
|
22
|
-
"""
|
|
23
|
-
查找元素 x 的根节点(路径压缩优化)。
|
|
24
|
-
如果元素不存在,则动态添加。
|
|
25
|
-
"""
|
|
26
|
-
if x not in self.parent: # 如果元素 x 不在 parent 字典中
|
|
27
|
-
self.parent[x] = x # 将 x 的父节点设置为自己(初始化)
|
|
28
|
-
self.rank[x] = 1 # 将 x 的秩初始化为 1
|
|
29
|
-
if self.parent[x] != x: # 如果 x 不是根节点(路径压缩优化)
|
|
30
|
-
self.parent[x] = self.find(self.parent[x]) # 递归查找根节点,并更新 x 的父节点
|
|
31
|
-
return self.parent[x] # 返回 x 的根节点
|
|
32
|
-
|
|
33
|
-
def union(self, x, y):
|
|
34
|
-
"""
|
|
35
|
-
合并元素 x 和 y 所在的集合(按秩合并优化)。
|
|
36
|
-
如果元素不存在,则动态添加。
|
|
37
|
-
"""
|
|
38
|
-
root_x = self.find(x) # 找到 x 的根节点
|
|
39
|
-
root_y = self.find(y) # 找到 y 的根节点
|
|
40
|
-
if root_x != root_y: # 如果 x 和 y 不在同一个集合中
|
|
41
|
-
# 按秩合并
|
|
42
|
-
if self.rank[root_x] > self.rank[root_y]: # 如果 x 所在集合的秩更大
|
|
43
|
-
self.parent[root_y] = root_x # 将 y 的根节点指向 x 的根节点
|
|
44
|
-
elif self.rank[root_x] < self.rank[root_y]: # 如果 y 所在集合的秩更大
|
|
45
|
-
self.parent[root_x] = root_y # 将 x 的根节点指向 y 的根节点
|
|
46
|
-
else: # 如果两个集合的秩相等
|
|
47
|
-
self.parent[root_y] = root_x # 将 y 的根节点指向 x 的根节点
|
|
48
|
-
self.rank[root_x] += 1 # 增加 x 所在集合的秩
|
|
49
|
-
|
|
50
|
-
def get_groups(self):
|
|
51
|
-
"""
|
|
52
|
-
获取所有分组,返回一个字典,键为根节点,值为该组的所有元素。
|
|
53
|
-
"""
|
|
54
|
-
groups = {} # 初始化一个空字典,用于存储分组
|
|
55
|
-
for x in self.parent: # 遍历所有元素
|
|
56
|
-
root = self.find(x) # 找到当前元素的根节点
|
|
57
|
-
if root not in groups: # 如果根节点不在 groups 字典中
|
|
58
|
-
groups[root] = [] # 初始化一个空列表
|
|
59
|
-
groups[root].append(x) # 将当前元素添加到对应根节点的列表中
|
|
60
|
-
return groups # 返回分组结果
|
|
1
|
+
"""
|
|
2
|
+
并查集(Union-Find)是一种用于管理元素分组的数据结构,主要用于解决动态连通性问题。它支持以下两种核心操作:
|
|
3
|
+
|
|
4
|
+
查找(Find):确定某个元素属于哪个集合。
|
|
5
|
+
|
|
6
|
+
合并(Union):将两个集合合并为一个集合。
|
|
7
|
+
|
|
8
|
+
并查集广泛应用于图论、网络连接、社交网络分析、图像处理等领域。
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class UnionFind:
    """
    Disjoint-set (union-find) structure over hashable elements.

    Elements do not have to be declared up front: they are registered the
    first time they are passed to ``find`` or ``union``.
    """

    def __init__(self):
        """Create an empty structure backed by two dictionaries."""
        self.parent = {}  # element -> parent element (a root maps to itself)
        self.rank = {}  # element -> rank, used to pick the surviving root

    def find(self, x):
        """
        Return the root of the set containing ``x``, compressing the path.

        An unknown ``x`` is registered as a singleton set with rank 1.
        """
        parents = self.parent
        if x not in parents:
            parents[x] = x
            self.rank[x] = 1

        # First pass: climb to the root.
        root = parents[x]
        while parents[root] != root:
            root = parents[root]

        # Second pass: point every node on the walked path at the root.
        node = x
        while node != root:
            next_node = parents[node]
            parents[node] = root
            node = next_node
        return parents[x]

    def union(self, x, y):
        """
        Merge the sets containing ``x`` and ``y`` (union by rank).

        Unknown elements are registered on the fly through ``find``.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return  # already in the same set

        ranks = self.rank
        if ranks[root_x] < ranks[root_y]:
            # The higher-ranked tree absorbs the lower one.
            self.parent[root_x] = root_y
            return

        self.parent[root_y] = root_x
        if ranks[root_x] == ranks[root_y]:
            # Equal ranks: the surviving root's rank grows by one.
            ranks[root_x] += 1

    def get_groups(self):
        """
        Return all sets as a dict mapping each root to the list of its members.
        """
        groups = {}
        for element in self.parent:
            groups.setdefault(self.find(element), []).append(element)
        return groups
|
|
@@ -1,196 +1,196 @@
|
|
|
1
|
-
# 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def clean_organ_postcode(organ):
|
|
8
|
-
"""
|
|
9
|
-
格式化组织名称字符串,移除括号内容并删除独立的6位数字(邮政编码),然后清理标点。
|
|
10
|
-
|
|
11
|
-
备注: 该方法替换java 里面的 formatOrgan
|
|
12
|
-
|
|
13
|
-
参数:
|
|
14
|
-
organ (str): 输入的组织名称字符串,可能包含括号、分号和邮政编码。
|
|
15
|
-
|
|
16
|
-
返回:
|
|
17
|
-
str: 格式化并清理后的组织名称字符串(无独立6位数字)。
|
|
18
|
-
"""
|
|
19
|
-
# 如果输入为空,设为空字符串以避免后续操作报错
|
|
20
|
-
if not organ:
|
|
21
|
-
organ = ""
|
|
22
|
-
|
|
23
|
-
# 删除方括号和圆括号中的内容(包括括号本身)
|
|
24
|
-
organ = re.sub(r"\[.*?\]", "", organ) # 非贪婪匹配方括号内容
|
|
25
|
-
organ = re.sub(r"\(.*?\)", "", organ) # 非贪婪匹配圆括号内容
|
|
26
|
-
|
|
27
|
-
# 定义正则表达式,匹配独立的6位数字
|
|
28
|
-
# \b 表示单词边界,确保6位数字是独立的(前后不是字母、数字或下划线)
|
|
29
|
-
organ = re.sub(r"\b[0-9]{6}\b", "", organ)
|
|
30
|
-
|
|
31
|
-
# 初始化结果列表,用于存储处理后的组织名称部分
|
|
32
|
-
format_organ = []
|
|
33
|
-
# 按分号分割字符串,生成组织名称的各个部分
|
|
34
|
-
organ_parts = organ.split(";")
|
|
35
|
-
|
|
36
|
-
# 遍历每个部分,追加到结果列表
|
|
37
|
-
for temp_organ in organ_parts:
|
|
38
|
-
# 去除首尾多余空格后追加(避免因移除邮编导致的空字符串)
|
|
39
|
-
cleaned_part = temp_organ.strip()
|
|
40
|
-
# 如果首尾是标点符号,则移除
|
|
41
|
-
# 定义标点符号的正则表达式(这里包括常见标点)
|
|
42
|
-
punctuation = r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$"
|
|
43
|
-
cleaned_part = re.sub(punctuation, "", cleaned_part)
|
|
44
|
-
if cleaned_part: # 只追加非空部分
|
|
45
|
-
format_organ.append(cleaned_part)
|
|
46
|
-
|
|
47
|
-
# 用分号连接结果,转换为大写并清理标点
|
|
48
|
-
format_organ = ";".join(format_organ)
|
|
49
|
-
|
|
50
|
-
# 返回最终结果并去除首尾空格
|
|
51
|
-
return format_organ.strip()
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def get_first_organ(organ):
|
|
55
|
-
if not organ:
|
|
56
|
-
return ""
|
|
57
|
-
organ_list = organ.strip().split(";")
|
|
58
|
-
for organ_one in organ_list:
|
|
59
|
-
# 清理邮政编码
|
|
60
|
-
organ_one = clean_organ_postcode(organ_one)
|
|
61
|
-
if organ_one.strip():
|
|
62
|
-
return organ_one
|
|
63
|
-
|
|
64
|
-
return ""
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def get_first_author(author: str) -> str:
|
|
68
|
-
if not author:
|
|
69
|
-
return ""
|
|
70
|
-
au_list = author.strip().split(";")
|
|
71
|
-
for au in au_list:
|
|
72
|
-
au = re.sub("\\[.*?]", "", au)
|
|
73
|
-
au = re.sub("\\(.*?\\)", "", au)
|
|
74
|
-
if au.strip():
|
|
75
|
-
return au
|
|
76
|
-
return ""
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def get_author_list(author: str):
|
|
80
|
-
lists = []
|
|
81
|
-
if not author:
|
|
82
|
-
return []
|
|
83
|
-
au_list = author.strip().split(";")
|
|
84
|
-
for au in au_list:
|
|
85
|
-
au = re.sub("\\[.*?]", "", au)
|
|
86
|
-
au = re.sub("\\(.*?\\)", "", au)
|
|
87
|
-
if au.strip():
|
|
88
|
-
lists.append(au.strip())
|
|
89
|
-
return lists
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def get_scopus_author_abbr(author_row: str):
|
|
93
|
-
if not author_row:
|
|
94
|
-
return ""
|
|
95
|
-
author_list = author_row.split("&&")
|
|
96
|
-
if len(author_list) != 3:
|
|
97
|
-
raise Exception("错误的数据个数 可能来自其他数据源")
|
|
98
|
-
|
|
99
|
-
abbr_list = author_list[0].strip().split(";")
|
|
100
|
-
abbr_list = [author.strip() for author in abbr_list if
|
|
101
|
-
author.strip() and author.strip().lower() not in ("*", "and")]
|
|
102
|
-
return ";".join(abbr_list)
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def get_wos_author_abbr(author_row: str):
|
|
106
|
-
if not author_row:
|
|
107
|
-
return ""
|
|
108
|
-
author_list = author_row.split("&&")
|
|
109
|
-
if len(author_list) != 4:
|
|
110
|
-
raise Exception("错误的数据个数 可能来自其他数据源")
|
|
111
|
-
abbr_list = []
|
|
112
|
-
abbr_list_au = author_list[0].strip().split(";")
|
|
113
|
-
abbr_list_ba = author_list[2].strip().split(";")
|
|
114
|
-
abbr_list.extend(abbr_list_au)
|
|
115
|
-
abbr_list.extend(abbr_list_ba)
|
|
116
|
-
abbr_list = [author.strip() for author in abbr_list if
|
|
117
|
-
author.strip() and author.strip().lower() not in ("*", "and")]
|
|
118
|
-
return ";".join(abbr_list)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def deal_rel_vol(vol_str: str):
|
|
122
|
-
"""
|
|
123
|
-
处理 期刊融合时的卷处理逻辑
|
|
124
|
-
"""
|
|
125
|
-
|
|
126
|
-
# 如果卷是全符号 清理掉
|
|
127
|
-
if is_all_symbols(vol_str):
|
|
128
|
-
vol_str = ""
|
|
129
|
-
|
|
130
|
-
if vol_str.replace(".", "").isdigit():
|
|
131
|
-
try:
|
|
132
|
-
float_num = float(vol_str)
|
|
133
|
-
if int(float_num) == float_num:
|
|
134
|
-
return str(int(float_num))
|
|
135
|
-
except:
|
|
136
|
-
pass
|
|
137
|
-
|
|
138
|
-
if vol_str.lower().startswith("v "):
|
|
139
|
-
vol_str = vol_str.lower().replace("v ", "").strip()
|
|
140
|
-
return vol_str
|
|
141
|
-
if vol_str.lower().startswith("volume "):
|
|
142
|
-
vol_str = vol_str.lower().replace("volume ", "").strip()
|
|
143
|
-
return vol_str
|
|
144
|
-
if vol_str.lower().startswith("vol. "):
|
|
145
|
-
vol_str = vol_str.lower().replace("vol. ", "").strip()
|
|
146
|
-
return vol_str
|
|
147
|
-
if vol_str.lower().startswith("vol "):
|
|
148
|
-
vol_str = vol_str.lower().replace("vol ", "").strip()
|
|
149
|
-
return vol_str
|
|
150
|
-
return vol_str
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def deal_num_strs(input_str):
|
|
154
|
-
"""
|
|
155
|
-
int后在str 防止有浮点型的表达方式
|
|
156
|
-
"""
|
|
157
|
-
number_list = re.findall(r'\d+', input_str)
|
|
158
|
-
transformed_numbers = [str(int(num)) for num in number_list]
|
|
159
|
-
|
|
160
|
-
# 替换原字符串中的数字为转换后的数字
|
|
161
|
-
for num, transformed_num in zip(number_list, transformed_numbers):
|
|
162
|
-
input_str = input_str.replace(num, transformed_num)
|
|
163
|
-
return input_str
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
def deal_num(num_str):
|
|
167
|
-
"""
|
|
168
|
-
将 期格式化 方便 group尤其是有横杆的数据
|
|
169
|
-
该方法 为融合二次分割时使用,如果场景合适也可以用于其他地方
|
|
170
|
-
:param strs:
|
|
171
|
-
:return:
|
|
172
|
-
"""
|
|
173
|
-
# 如果期是全符号清理掉
|
|
174
|
-
if is_all_symbols(num_str):
|
|
175
|
-
num_str = ""
|
|
176
|
-
|
|
177
|
-
if num_str.lower().startswith("n "):
|
|
178
|
-
num_str = num_str.lower().replace("n ", "").strip()
|
|
179
|
-
|
|
180
|
-
num_str = num_str.lower().replace("special_issue_", '').replace("_special_issue", '').replace("issue", "")
|
|
181
|
-
num_str = num_str.replace("spec.", "").replace("iss.", "").replace("spl.", "").replace("special.", "").replace(
|
|
182
|
-
"specialissue.", "")
|
|
183
|
-
num_str = num_str.replace("spec", "").replace("iss", "").replace("spl", "").replace("special", "").replace(
|
|
184
|
-
"specialissue", '')
|
|
185
|
-
|
|
186
|
-
num_str = num_str.replace("-", "_").replace(".", "_").upper()
|
|
187
|
-
num_str = num_str.lstrip("_").rstrip("_")
|
|
188
|
-
if num_str.find("_") > -1:
|
|
189
|
-
start, end = num_str.split("_")
|
|
190
|
-
start = deal_num_strs(start)
|
|
191
|
-
end = deal_num_strs(end)
|
|
192
|
-
num_str = start + "_" + end
|
|
193
|
-
else:
|
|
194
|
-
num_str = deal_num_strs(num_str)
|
|
195
|
-
|
|
196
|
-
return num_str.lower().strip()
|
|
1
|
+
# 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def clean_organ_postcode(organ):
    """
    Normalise an organisation string.

    Bracketed text (``[...]`` and ``(...)``) and stand-alone six-digit numbers
    (postcodes) are removed; then every ``;``-separated part is trimmed of
    surrounding whitespace and of leading/trailing punctuation. Parts that end
    up empty are dropped.

    Note: this function replaces ``formatOrgan`` from the Java code base.

    Args:
        organ (str): Organisation names, possibly containing brackets,
            semicolons and postcodes. A falsy value is treated as "".

    Returns:
        str: The cleaned parts joined with ";".
    """
    text = organ if organ else ""

    # Drop "[...]" first, then "(...)", brackets included (non-greedy).
    for bracket_pattern in (r"\[.*?\]", r"\(.*?\)"):
        text = re.sub(bracket_pattern, "", text)

    # \b keeps the match to stand-alone runs of exactly six digits, i.e. not
    # adjacent to a letter, digit or underscore.
    text = re.sub(r"\b[0-9]{6}\b", "", text)

    # Punctuation to peel off the start and the end of each part.
    edge_punctuation = r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$"
    trimmed_parts = (
        re.sub(edge_punctuation, "", piece.strip()) for piece in text.split(";")
    )
    return ";".join(piece for piece in trimmed_parts if piece).strip()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_first_organ(organ):
    """
    Return the first non-empty organisation of a ";"-separated string.

    Each part is passed through ``clean_organ_postcode`` and the first one
    that is not blank afterwards is returned.

    Args:
        organ: ";"-separated organisation names; a falsy value yields "".

    Returns:
        str: The first cleaned, non-blank organisation, or "" if there is none.
    """
    if not organ:
        return ""
    # map() is lazy, so cleaning stops at the first usable part.
    cleaned_parts = map(clean_organ_postcode, organ.strip().split(";"))
    return next((name for name in cleaned_parts if name.strip()), "")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_first_author(author: str) -> str:
    """
    Return the first non-empty author of a ";"-separated author string.

    Bracketed annotations (``[...]`` and ``(...)``) are removed from each
    name. The result is stripped of surrounding whitespace, so removing a
    trailing annotation such as ``"Zhang San [1]"`` no longer leaves a
    dangling space, matching what ``get_author_list`` returns.

    Args:
        author: ";"-separated author names; a falsy value yields "".

    Returns:
        The first name that is not blank after cleaning, or "" if none is.
    """
    if not author:
        return ""
    for raw_name in author.strip().split(";"):
        name = re.sub("\\[.*?]", "", raw_name)
        name = re.sub("\\(.*?\\)", "", name).strip()
        if name:
            return name
    return ""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_author_list(author: str):
    """
    Split a ";"-separated author string into a list of cleaned names.

    Bracketed annotations (``[...]`` and ``(...)``) are removed from each
    name, names are stripped, and blank entries are dropped.

    Args:
        author: ";"-separated author names; a falsy value yields [].

    Returns:
        list: The cleaned author names in their original order.
    """
    if not author:
        return []
    cleaned_names = (
        re.sub("\\(.*?\\)", "", re.sub("\\[.*?]", "", raw_name)).strip()
        for raw_name in author.strip().split(";")
    )
    return [name for name in cleaned_names if name]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_scopus_author_abbr(author_row: str):
    """
    Extract the abbreviated author names from a Scopus author row.

    The row must consist of exactly three "&&"-separated fields; only the
    first field is used. It is split on ";" and entries that are blank, "*"
    or "and" (case-insensitive) are discarded.

    Args:
        author_row: The raw "&&"-separated row; a falsy value yields "".

    Returns:
        str: The remaining names joined with ";".

    Raises:
        Exception: If the row does not have exactly three fields.
    """
    if not author_row:
        return ""
    fields = author_row.split("&&")
    if len(fields) != 3:
        raise Exception("错误的数据个数 可能来自其他数据源")

    kept = []
    for token in fields[0].strip().split(";"):
        token = token.strip()
        if token and token.lower() not in ("*", "and"):
            kept.append(token)
    return ";".join(kept)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def get_wos_author_abbr(author_row: str):
    """
    Extract the abbreviated author names from a Web of Science author row.

    The row must consist of exactly four "&&"-separated fields. The first and
    the third field are each split on ";" and concatenated in that order;
    entries that are blank, "*" or "and" (case-insensitive) are discarded.

    Args:
        author_row: The raw "&&"-separated row; a falsy value yields "".

    Returns:
        str: The remaining names joined with ";".

    Raises:
        Exception: If the row does not have exactly four fields.
    """
    if not author_row:
        return ""
    fields = author_row.split("&&")
    if len(fields) != 4:
        raise Exception("错误的数据个数 可能来自其他数据源")

    candidates = fields[0].strip().split(";") + fields[2].strip().split(";")
    kept = []
    for token in candidates:
        token = token.strip()
        if token and token.lower() not in ("*", "and"):
            kept.append(token)
    return ";".join(kept)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def deal_rel_vol(vol_str: str):
    """
    Normalise a journal volume string for record merging.

    Steps, in order:
      1. A value that ``is_all_symbols`` reports as symbols-only becomes "".
      2. A value made of digits and dots that parses to a whole number is
         returned as that integer in string form (e.g. "05" or "5.0" -> "5").
      3. A leading "v ", "volume ", "vol. " or "vol " (case-insensitive) is
         removed and the lower-cased, stripped remainder is returned. Only the
         leading prefix is removed; later occurrences of the same characters
         inside the value are left alone.
      4. Anything else is returned unchanged.

    Args:
        vol_str: The raw volume string.

    Returns:
        str: The normalised volume.
    """
    # Drop values that consist of symbols only.
    if is_all_symbols(vol_str):
        vol_str = ""

    if vol_str.replace(".", "").isdigit():
        try:
            float_num = float(vol_str)
            if int(float_num) == float_num:
                return str(int(float_num))
        except (ValueError, OverflowError):
            # e.g. "1.2.3" or digit-like characters float() rejects, or a
            # value so large it parses to infinity: fall through unchanged.
            pass

    lowered = vol_str.lower()
    for prefix in ("v ", "volume ", "vol. ", "vol "):
        if lowered.startswith(prefix):
            return lowered[len(prefix):].strip()
    return vol_str
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def deal_num_strs(input_str):
    """
    Rewrite every run of digits in ``input_str`` in canonical integer form.

    Each run is converted with ``int`` and back to ``str``, which removes
    leading zeros ("007" -> "7"). Each run is replaced at its own position in
    a single pass, so a number that happens to contain another number as a
    substring (e.g. "7007" next to "007") is not corrupted.

    Args:
        input_str: The text to normalise.

    Returns:
        str: ``input_str`` with every digit run normalised.
    """
    return re.sub(r"\d+", lambda match: str(int(match.group())), input_str)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def deal_num(num_str):
    """
    Normalise a journal issue string so that equal issues group together.

    Written for the second split step of record merging, in particular for
    issues containing dashes; usable elsewhere when the rules fit.

    Steps, in order:
      1. A value that ``is_all_symbols`` reports as symbols-only becomes "".
      2. The value is lower-cased and a leading "n " is removed.
      3. Special-issue markers ("special_issue_", "issue", "spec.", "iss",
         "spl", "special", ...) are removed. Longer markers are removed before
         their shorter prefixes so that "special" is not reduced to "ial".
      4. "-" and "." become "_" and leading/trailing "_" are dropped.
      5. Every "_"-separated part is passed through ``deal_num_strs`` and the
         parts are re-joined with "_". Any number of separators is accepted.

    :param num_str: the raw issue string
    :return: the normalised, lower-cased and stripped issue string
    """
    # Drop values that consist of symbols only.
    if is_all_symbols(num_str):
        num_str = ""

    num_str = num_str.lower()
    if num_str.startswith("n "):
        # Remove the leading marker only, not every "n " inside the value.
        num_str = num_str[2:].strip()

    for marker in ("special_issue_", "_special_issue", "issue"):
        num_str = num_str.replace(marker, "")
    # Dotted forms first, then bare forms; within each group longest first.
    for marker in (
        "specialissue.", "special.", "spec.", "iss.", "spl.",
        "specialissue", "special", "spec", "iss", "spl",
    ):
        num_str = num_str.replace(marker, "")

    num_str = num_str.replace("-", "_").replace(".", "_").upper()
    num_str = num_str.strip("_")
    # A value without "_" is a single part, so one expression covers both the
    # range case ("1_2") and the plain case ("12").
    num_str = "_".join(deal_num_strs(part) for part in num_str.split("_"))

    return num_str.lower().strip()
|