re-common 0.2.53__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/utils/baseboto3.py +102 -8
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/v2/__init__.py +0 -0
- re_common/v2/baselibrary/__init__.py +0 -0
- re_common/v2/baselibrary/s3object/__init__.py +0 -0
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -0
- re_common/v2/baselibrary/tools/__init__.py +0 -0
- re_common/v2/baselibrary/tools/dict_tools.py +24 -0
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -0
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +74 -0
- re_common/v2/baselibrary/utils/__init__.py +0 -0
- re_common/v2/baselibrary/utils/basedict.py +26 -0
- re_common/v2/baselibrary/utils/basehdfs.py +127 -0
- re_common/v2/baselibrary/utils/json_cls.py +11 -0
- re_common/v2/baselibrary/utils/string_bool.py +9 -0
- re_common/v2/baselibrary/utils/string_clear.py +84 -0
- re_common/v2/baselibrary/utils/stringutils.py +60 -0
- {re_common-0.2.53.dist-info → re_common-2.0.0.dist-info}/METADATA +16 -20
- {re_common-0.2.53.dist-info → re_common-2.0.0.dist-info}/RECORD +24 -9
- {re_common-0.2.53.dist-info → re_common-2.0.0.dist-info}/WHEEL +1 -1
- {re_common-0.2.53.dist-info → re_common-2.0.0.dist-info}/LICENSE +0 -0
- {re_common-0.2.53.dist-info → re_common-2.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
from boto3.session import Session
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseBoto3(object):
    """Convenience wrapper around a boto3 ``s3`` client.

    The client is created from an access key, a secret key and an endpoint
    URL.  When all three are passed to the constructor the session and the
    client are created immediately; otherwise call ``set_key()``,
    ``conn_session()`` and ``get_client()`` yourself.
    """

    # Upper bound on the number of listing pages ``get_prefix_count`` walks
    # for a single prefix before it gives up.
    _MAX_PAGES = 10000

    def __init__(self, aws_access_key_id="", aws_secret_access_key="", endpoint_url=""):
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.session = None
        self.client = None
        # Connect eagerly only when every setting was supplied.
        if self.aws_access_key_id and self.aws_secret_access_key and self.endpoint_url:
            self.conn_session()
            self.get_client()

    def set_key(self, aws_access_key_id, aws_secret_access_key, endpoint_url):
        """Store the credentials and endpoint; returns ``self`` for chaining."""
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        return self

    def conn_session(self):
        """Create, store and return the boto3 ``Session``.

        Raises ``AssertionError`` when either key is ``None`` or empty.
        """
        assert self.aws_access_key_id not in (None, '')
        assert self.aws_secret_access_key not in (None, '')
        self.session = Session(aws_access_key_id=self.aws_access_key_id,
                               aws_secret_access_key=self.aws_secret_access_key)
        return self.session

    def get_client(self):
        """Create the ``s3`` client from the stored session; returns ``self``.

        Raises ``AssertionError`` when ``conn_session()`` was not called first.
        """
        assert self.session is not None
        self.client = self.session.client('s3', endpoint_url=self.endpoint_url)
        return self

    def get_all_buckets(self):
        """Return the raw ``list_buckets`` response."""
        return self.client.list_buckets()

    def create_buckets(self, buckets_name):
        """Create a bucket and return the raw ``create_bucket`` response.

        Raises ``AssertionError`` when the name contains an underscore.
        """
        assert "_" not in buckets_name, "新建一个bucket桶(bucket name 中不能有_下划线)"
        return self.client.create_bucket(Bucket=buckets_name)

    def delete_buckets(self, bucket_name):
        """Delete a bucket and return the raw ``delete_bucket`` response.

        Per the original author's note, only empty buckets can be deleted.
        """
        return self.client.delete_bucket(Bucket=bucket_name)

    def get_bucket(self, bucket_name):
        """Not implemented; always raises ``Exception``."""
        raise Exception("无实现方法")

    def _list_page(self, bucket_name, prefix=None, delimiter=None, continuation_token=None):
        """Fetch one ``list_objects_v2`` page, sending only the options that are set.

        botocore validates parameter types, so ``Prefix=None`` (the public
        default) must be omitted rather than forwarded.
        """
        params = {"Bucket": bucket_name}
        if prefix is not None:
            params["Prefix"] = prefix
        if delimiter is not None:
            params["Delimiter"] = delimiter
        if continuation_token:
            params["ContinuationToken"] = continuation_token
        return self.client.list_objects_v2(**params)

    @staticmethod
    def _next_token(response):
        """Return the continuation token of a truncated page, else ``None``."""
        if response.get('IsTruncated'):
            return response.get('NextContinuationToken')
        return None

    def get_all_objs(self, bucket_name, prefix=None, continuation_token=None):
        """List one page of object keys, optionally restricted to ``prefix``.

        :param bucket_name: bucket to list
        :param prefix: only keys starting with this prefix; ``None`` lists all
        :param continuation_token: token returned by the previous call when
            the listing was truncated (the sample response in the original
            notes shows ``MaxKeys`` of 1000)
        :return: ``(keys, next_token)``; ``next_token`` is ``None`` when the
            response is not truncated
        """
        response = self._list_page(bucket_name, prefix=prefix,
                                   continuation_token=continuation_token)
        keys = [obj['Key'] for obj in response.get('Contents', [])]
        return keys, self._next_token(response)

    def list_prefixes(self, bucket_name, prefix=None, Delimiter="/", continuation_token=None):
        """List one page of keys and sub-"directories" directly under ``prefix``.

        :param prefix: should end with ``Delimiter`` (e.g. ``"a/"`` for ``"/"``)
        :param Delimiter: separator used to emulate a directory structure
        :param continuation_token: token from the previous truncated page
        :return: ``(keys, prefixes, next_token)``; the prefixes come from
            ``CommonPrefixes`` and include ``prefix`` itself, e.g.
            ``['a/b/', 'a/c/']``
        """
        response = self._list_page(bucket_name, prefix=prefix, delimiter=Delimiter,
                                   continuation_token=continuation_token)
        keys = [obj['Key'] for obj in response.get('Contents', [])]
        prefixes = [obj['Prefix'] for obj in response.get('CommonPrefixes', [])]
        return keys, prefixes, self._next_token(response)

    def get_object_value(self, bucket_name, file_key, encoding='utf-8'):
        """Read an object fully and return its body decoded with ``encoding``."""
        obj = self.client.get_object(Bucket=bucket_name, Key=file_key)
        return obj['Body'].read().decode(encoding)

    def put_object(self, bucket_name, key, body):
        """Write ``body`` directly to ``key``.

        Per the original note, text should be encoded first
        (``.encode('utf-8')``).
        """
        self.client.put_object(Bucket=bucket_name,
                               Key=key,
                               Body=body)

    def download_file(self, bucket_name, key, local_file):
        """Download ``key`` to the local path; returns the client's result (``None`` per the original note)."""
        return self.client.download_file(bucket_name, key, local_file)

    def upload_file(self, bucket_name, key, local_file):
        """Upload a local file.

        :param bucket_name: bucket name
        :param key: remote location inside the bucket, e.g. ``test1/test.pdf``
        :param local_file: local file path
        """
        self.client.upload_file(local_file, bucket_name, key)

    def download_fileobj(self, bucket_name, key, fileobj):
        """Download ``key`` into ``fileobj``; returns the client's result (``None`` per the original note)."""
        return self.client.download_fileobj(bucket_name, key, fileobj)

    def upload_fileobj(self, bucket_name, key, fileobj):
        """Upload from ``fileobj`` (a byte stream) to ``key``."""
        self.client.upload_fileobj(fileobj, bucket_name, key)

    def check_exist_or_file_info(self, bucket_name, key):
        """Return the ``head_object`` response for ``key``, or ``None`` on any error.

        Best-effort existence check: every ``Exception`` (missing key, but
        also permission or network problems) is reported as ``None``.
        ``KeyboardInterrupt`` / ``SystemExit`` are no longer swallowed.
        """
        try:
            return self.client.head_object(Bucket=bucket_name, Key=key)
        except Exception:
            return None

    def get_prefix_count(self, bucket_name, obj_count, prefix, continuation_token=None):
        """Recursively count the keys under ``prefix``.

        Slow by design: every sub-prefix needs its own listing request.

        :param obj_count: running total to add to (pass ``0`` to start)
        :param prefix: prefix to count below
        :param continuation_token: page to resume from, normally ``None``
        :return: ``obj_count`` plus the number of keys found
        :raises Exception: when the page limit is (nearly) exhausted while
            more pages remain
        """
        for page_index in range(self._MAX_PAGES):
            keys, sub_prefixes, token = self.list_prefixes(bucket_name=bucket_name,
                                                           prefix=prefix,
                                                           continuation_token=continuation_token)
            obj_count += len(keys)
            for sub_prefix in sub_prefixes:
                obj_count = self.get_prefix_count(bucket_name, obj_count, sub_prefix)

            if not token:
                break
            continuation_token = token

            # Safety net against an endless pagination loop.
            if page_index > self._MAX_PAGES - 5:
                raise Exception("循环耗尽,请检查逻辑正确性")

        return obj_count
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
class DotDict(dict):
    """A ``dict`` whose keys can also be read and written as attributes.

    Nested dictionaries passed to the constructor or assigned through
    attribute syntax are converted to ``DotDict`` so dotted access works at
    every level.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Recursively convert nested dictionaries.  Only values are replaced,
        # so the dict does not change size while it is being iterated.
        for key, value in self.items():
            if isinstance(value, dict):
                self[key] = DotDict(value)

    def __getattr__(self, key):
        """Return ``self[key]``; raise ``AttributeError`` when the key is missing.

        The stored object itself is returned (not a copy), so chained
        assignments such as ``d.a.b = 1`` modify this mapping.  A plain
        ``dict`` that was stored through item assignment is converted to a
        ``DotDict`` once and stored back for the same reason.
        """
        try:
            value = self[key]
        except KeyError:
            raise AttributeError(f"'DotDict' object has no attribute '{key}'") from None
        if isinstance(value, dict) and not isinstance(value, DotDict):
            value = DotDict(value)
            self[key] = value
        return value

    def __setattr__(self, key, value):
        """Store ``value`` under ``key``, converting dictionaries to ``DotDict``."""
        if isinstance(value, dict):
            value = DotDict(value)
        self[key] = value
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
并查集(Union-Find)是一种用于管理元素分组的数据结构,主要用于解决动态连通性问题。它支持以下两种核心操作:
|
|
3
|
+
|
|
4
|
+
查找(Find):确定某个元素属于哪个集合。
|
|
5
|
+
|
|
6
|
+
合并(Union):将两个集合合并为一个集合。
|
|
7
|
+
|
|
8
|
+
并查集广泛应用于图论、网络连接、社交网络分析、图像处理等领域。
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class UnionFind:
    """Disjoint-set structure over arbitrary hashable elements.

    Elements are registered lazily the first time they are seen by
    ``find`` or ``union``.
    """

    def __init__(self):
        """Create an empty structure backed by two dictionaries."""
        self.parent = {}  # element -> parent element (a root is its own parent)
        self.rank = {}    # element -> rank used to decide which root wins a merge

    def find(self, x):
        """Return the root of ``x``, registering ``x`` first if it is unknown.

        Every element on the path from ``x`` to the root is re-pointed
        directly at the root (path compression).
        """
        links = self.parent
        if x not in links:
            links[x] = x
            self.rank[x] = 1

        # First pass: walk up to the root.
        root = x
        while links[root] != root:
            root = links[root]

        # Second pass: point every node on the path straight at the root.
        node = x
        while node != root:
            following = links[node]
            links[node] = root
            node = following
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (union by rank).

        Unknown elements are registered on the fly.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return

        rank_x, rank_y = self.rank[root_x], self.rank[root_y]
        if rank_x < rank_y:
            # y's tree is ranked higher: it absorbs x's tree.
            self.parent[root_x] = root_y
            return

        # x's tree wins; on a tie its rank grows by one.
        self.parent[root_y] = root_x
        if rank_x == rank_y:
            self.rank[root_x] += 1

    def get_groups(self):
        """Return ``{root: [members...]}`` for every registered element."""
        groups = {}
        for member in self.parent:
            groups.setdefault(self.find(member), []).append(member)
        return groups
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# 某些业务中的字符串处理 算是特定场景的工具
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def clean_organ_postcode(organ):
    """Normalise a ``;``-separated organisation string.

    Steps, in order:
      1. a falsy input is treated as the empty string;
      2. ``[...]`` and ``(...)`` groups are removed, brackets included;
      3. stand-alone 6-digit numbers (postcodes) are removed;
      4. the string is split on ``;``; each part is stripped of surrounding
         whitespace and then of leading/trailing punctuation;
      5. the non-empty parts are re-joined with ``;``.

    Note: per the original author, this replaces the Java ``formatOrgan``.

    Args:
        organ (str): organisation names, possibly with brackets, semicolons
            and postcodes.

    Returns:
        str: the cleaned organisation string.
    """
    text = organ if organ else ""

    # Non-greedy removal of bracketed content: square brackets first, then round.
    for bracket_pattern in (r"\[.*?\]", r"\(.*?\)"):
        text = re.sub(bracket_pattern, "", text)

    # \b keeps the match to 6-digit runs that are not glued to other word
    # characters (letters, digits or underscore).
    text = re.sub(r"\b[0-9]{6}\b", "", text)

    # Punctuation that may be trimmed from either end of a part.
    edge_punctuation = r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$"
    trimmed_parts = (
        re.sub(edge_punctuation, "", piece.strip())
        for piece in text.split(";")
    )

    # Drop parts that became empty (e.g. a part that was only a postcode).
    return ";".join(piece for piece in trimmed_parts if piece).strip()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_first_organ(organ):
    """Return the first non-empty organisation of a ``;``-separated string.

    Each part is passed through ``clean_organ_postcode``; the first one that
    is still non-empty afterwards is returned.  Returns ``""`` for a falsy
    input or when every part cleans down to nothing.
    """
    if not organ:
        return ""
    # Lazy generator: cleaning stops at the first part that survives.
    cleaned_parts = (clean_organ_postcode(piece) for piece in organ.strip().split(";"))
    return next((name for name in cleaned_parts if name), "")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_first_author(author: object) -> object:
    """Return the first non-empty author of a ``;``-separated author string.

    For each part, ``[...]`` and ``(...)`` groups are removed and the
    remainder is stripped of surrounding whitespace; the first part that is
    still non-empty is returned.  Stripping matters: without it a part such
    as ``"[1] "`` would be returned as a whitespace-only "author".

    :param author: ``;``-separated author names (expected to be a ``str``);
        a falsy value yields ``""``
    :return: the first cleaned author name, or ``""`` when there is none
    """
    if not author:
        return ""
    for candidate in author.strip().split(";"):
        candidate = re.sub("\\[.*?]", "", candidate)
        candidate = re.sub("\\(.*?\\)", "", candidate)
        candidate = candidate.strip()
        if candidate:
            return candidate
    return ""
|
|
File without changes
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
logger = logging.getLogger(__name__) # 创建 logger 实例
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseDict(object):
    """Dictionary helper routines."""

    @classmethod
    def flip_dict(cls, original_dict, raise_on_conflict=False):
        """Invert a ``{key: [values...]}`` mapping into ``{value: key}``.

        :param original_dict: mapping whose values are iterables of hashable items
        :param raise_on_conflict: when true, raise ``ValueError`` if the same
            item appears more than once; otherwise a warning is logged and
            the later key wins
        :return: the inverted dictionary
        """
        inverted = {}
        for source_key, members in original_dict.items():
            for member in members:
                seen_before = member in inverted
                if seen_before and raise_on_conflict:
                    raise ValueError(f"Key conflict detected: {member} already exists in the flipped dictionary.")
                if seen_before:
                    # Not fatal: report the clash, then let the new key overwrite.
                    logger.warning(
                        f"Warning: Key conflict detected for {member}. Overwriting with new value: {source_key}.")
                inverted[member] = source_key
        return inverted
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
|
|
4
|
+
from hdfs import InsecureClient
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class HDFSUtils(object):
    """Utility class bundling common HDFS operations on an ``InsecureClient``."""

    def __init__(self, hdfs_url, hdfs_user):
        """Create the HDFS client.

        :param hdfs_url: HDFS URL, e.g. "http://namenode:50070"
        :param hdfs_user: HDFS user name
        """
        self.hdfs_url, self.hdfs_user = hdfs_url, hdfs_user
        self.client = InsecureClient(hdfs_url, user=hdfs_user)

    def upload_file(self, local_path, hdfs_path, overwrite=False):
        """Upload a local file to HDFS and print a confirmation.

        :param local_path: local file path
        :param hdfs_path: destination path on HDFS
        :param overwrite: whether an existing file may be overwritten
        :return: None
        """
        client = self.client
        client.upload(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件上传成功: {local_path} -> {hdfs_path}")

    def download_file(self, hdfs_path, local_path, overwrite=False):
        """Download a file from HDFS and print a confirmation.

        :param hdfs_path: source path on HDFS
        :param local_path: local destination path
        :param overwrite: whether an existing file may be overwritten
        :return: None
        """
        client = self.client
        client.download(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件下载成功: {hdfs_path} -> {local_path}")

    def delete_file(self, hdfs_path, recursive=False):
        """Delete a file or directory on HDFS and print a confirmation.

        :param hdfs_path: file or directory path on HDFS
        :param recursive: whether directories are deleted recursively
        :return: None
        """
        client = self.client
        client.delete(hdfs_path, recursive=recursive)
        print(f"文件/目录删除成功: {hdfs_path}")

    def create_directory(self, hdfs_path):
        """Create a directory on HDFS and print a confirmation.

        :param hdfs_path: directory path on HDFS
        :return: None
        """
        client = self.client
        client.makedirs(hdfs_path)
        print(f"目录创建成功: {hdfs_path}")

    def list_files(self, hdfs_path):
        """Return what the client lists under an HDFS directory.

        :param hdfs_path: directory path on HDFS
        :return: list of files / sub-directories
        """
        return self.client.list(hdfs_path)

    def read_file(self, hdfs_path):
        """Read a whole HDFS file, print a confirmation and return the content.

        :param hdfs_path: file path on HDFS
        :return: whatever the reader's ``read()`` returns
        """
        with self.client.read(hdfs_path) as stream:
            data = stream.read()
        print(f"文件读取成功: {hdfs_path}")
        return data

    def read_gz_file(self, hdfs_path, encoding='utf-8'):
        """Read a gzip-compressed HDFS file and return its decoded text.

        :param hdfs_path: file path on HDFS (expected to end with .gz)
        :param encoding: text encoding of the decompressed data (default utf-8)
        :return: decompressed content as ``str``
        """
        # Pull the raw compressed bytes first ...
        with self.client.read(hdfs_path) as stream:
            raw = stream.read()
        # ... then decompress them in memory and decode to text.
        with gzip.GzipFile(fileobj=BytesIO(raw)) as unpacked:
            text = unpacked.read().decode(encoding)
        print(f"文件读取成功: {hdfs_path}")
        return text

    def write_file(self, hdfs_path, content, overwrite=False, encoding='utf-8'):
        """Write ``content`` to an HDFS file and print a confirmation.

        :param hdfs_path: file path on HDFS
        :param content: content to write
        :param overwrite: whether an existing file may be overwritten
        :param encoding: encoding handed to the client's writer
        :return: None
        """
        with self.client.write(hdfs_path, overwrite=overwrite, encoding=encoding) as sink:
            sink.write(content)
        print(f"文件写入成功: {hdfs_path}")

    def file_exists(self, hdfs_path):
        """Tell whether a file or directory exists on HDFS.

        :param hdfs_path: file or directory path on HDFS
        :return: ``True`` when the non-strict status lookup returns something
        """
        return self.client.status(hdfs_path, strict=False) is not None

    def rename_file(self, hdfs_src_path, hdfs_dst_path):
        """Rename or move a file/directory on HDFS and print a confirmation.

        :param hdfs_src_path: source path
        :param hdfs_dst_path: destination path
        :return: None
        """
        client = self.client
        client.rename(hdfs_src_path, hdfs_dst_path)
        print(f"文件/目录重命名成功: {hdfs_src_path} -> {hdfs_dst_path}")
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BytesEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as base64 text.

    Usage: ``json.dumps(x, ensure_ascii=False, cls=BytesEncoder)``
    """

    def default(self, obj):
        """Return base64 text for ``bytes``; defer everything else to the base class."""
        if not isinstance(obj, bytes):
            return super().default(obj)
        encoded = base64.b64encode(obj)
        return encoded.decode('utf-8')
|
|
10
|
+
|
|
11
|
+
# json.dumps(x, ensure_ascii=False, cls=BytesEncoder)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import regex
|
|
3
|
+
|
|
4
|
+
from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class StringClear(object):
    """Fluent string cleaner.

    Wraps a string in ``obj_str``; every cleaning method rewrites that
    attribute and returns ``self`` so calls can be chained.  Finish the
    chain with ``get_str()``.
    """

    def __init__(self, obj_str):
        self.obj_str = obj_str

    def None_to_str(self):
        """Replace a ``None`` value with the empty string."""
        if self.obj_str is None:
            self.obj_str = ''
        return self

    def qj_to_bj(self):
        """Convert full-width characters to half-width (delegates to ``qj2bj``)."""
        self.obj_str = qj2bj(self.obj_str)
        return self

    def bj_to_qj(self):
        """Convert half-width characters to full-width (delegates to ``bj2qj``)."""
        self.obj_str = bj2qj(self.obj_str)
        # Return self like every other step so the chain is not broken.
        return self

    def lower(self):
        """Lower-case the string."""
        self.obj_str = self.obj_str.lower()
        return self

    def upper(self):
        """Upper-case the string."""
        self.obj_str = self.obj_str.upper()
        return self

    def collapse_spaces(self):
        """Collapse every run of whitespace into a single space."""
        self.obj_str = re.sub(r"\s+", " ", self.obj_str)
        return self

    def clear_all_spaces(self):
        """Remove all whitespace."""
        self.obj_str = re.sub("\\s+", "", self.obj_str)
        return self

    def clean_symbols(self):
        """Remove a fixed set of known symbols.

        ``\\p{P}`` matches Unicode punctuation; the remaining characters are
        extra symbols listed explicitly.  ``\\p{S}`` would also work but,
        per the original author, it is very broad and may remove too much.
        """
        self.obj_str = regex.sub(
            "[\\p{P}+~$`^=|<>~`$^+=|<>¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]",
            "",
            self.obj_str)
        return self

    def remove_special_chars(self):
        """Remove everything except word characters and whitespace.

        ``\\w`` already covers Unicode letters and digits; the underscore is
        part of ``\\w`` as well and is therefore kept.
        """
        self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
        return self

    def remove_underline(self):
        """Remove underscores (kept by ``remove_special_chars`` because they are in ``\\w``)."""
        self.obj_str = re.sub("[_]", "", self.obj_str)
        return self

    def remove_diacritics(self):
        """Replace accented characters with plain letters (delegates to ``get_diacritic_variant``)."""
        self.obj_str = get_diacritic_variant(self.obj_str)
        # Return self like every other step so the chain is not broken.
        return self

    def remove_brackets(self):
        """Remove ``[...]`` groups, brackets included."""
        self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
        return self

    def remove_parentheses(self):
        """Remove ``(...)`` groups, parentheses included."""
        self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
        return self

    def get_str(self):
        """Return the current string."""
        return self.obj_str
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def rel_clear(str_obj):
    """Apply the cleaning rules tailored for merged ("fusion") data.

    Steps, in order: full-width to half-width, remove special characters,
    collapse whitespace runs, lower-case.
    """
    cleaner = StringClear(str_obj)
    # Each step mutates the cleaner in place, so they can be run one by one.
    for step in (cleaner.qj_to_bj,
                 cleaner.remove_special_chars,
                 cleaner.collapse_spaces,
                 cleaner.lower):
        step()
    return cleaner.get_str()
|