re-common 10.0.23__py3-none-any.whl → 10.0.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +196 -0
- re_common/v2/baselibrary/business_utils/__init__.py +0 -0
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -0
- re_common/v2/baselibrary/tools/WeChatRobot.py +18 -2
- re_common/v2/baselibrary/tools/data_processer/__init__.py +0 -0
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -0
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -0
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -0
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -0
- re_common/v2/baselibrary/tools/dict_tools.py +7 -0
- re_common/v2/baselibrary/tools/list_tools.py +5 -1
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -0
- re_common/v2/baselibrary/utils/db.py +21 -2
- re_common/v2/baselibrary/utils/string_bool.py +59 -22
- re_common/v2/baselibrary/utils/string_clear.py +6 -1
- re_common/v2/baselibrary/utils/stringutils.py +15 -0
- {re_common-10.0.23.dist-info → re_common-10.0.25.dist-info}/METADATA +2 -10
- {re_common-10.0.23.dist-info → re_common-10.0.25.dist-info}/RECORD +21 -12
- {re_common-10.0.23.dist-info → re_common-10.0.25.dist-info}/WHEEL +1 -1
- {re_common-10.0.23.dist-info → re_common-10.0.25.dist-info}/LICENSE +0 -0
- {re_common-10.0.23.dist-info → re_common-10.0.25.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from hdfs import InsecureClient
|
|
6
|
+
|
|
7
|
+
from re_common.v2.baselibrary.tools.data_processer.base import BaseFileWriter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HDFSFileWriter(BaseFileWriter):
    """Writer that uploads text lines to HDFS through an ``InsecureClient``."""

    def __init__(self, file_path: str, hdfs_url: str, hdfs_user: str, *args, **kwargs):
        """
        Initialise the base writer and open the HDFS client.

        :param file_path: default destination path, forwarded to ``BaseFileWriter``
        :param hdfs_url: URL handed to ``InsecureClient``
        :param hdfs_user: user name handed to ``InsecureClient``
        :param args: extra positional arguments forwarded to ``BaseFileWriter``
        :param kwargs: extra keyword arguments forwarded to ``BaseFileWriter``
        """
        super().__init__(file_path, *args, **kwargs)
        self.client = InsecureClient(hdfs_url, user=hdfs_user)

    def write_lines(self, lines: List[str], file_path: str = None):
        """
        Join ``lines`` with newline characters and write the result to HDFS.

        No trailing newline is appended after the last line. The encoding,
        compression and overwrite behaviour come from ``self.encoding``,
        ``self.compress`` and ``self.overwrite`` (presumably set by
        ``BaseFileWriter`` -- confirm against the base class).

        :param lines: text lines to write
        :param file_path: destination path; ``self.file_path`` is used when None
        """
        target = self.file_path if file_path is None else file_path
        payload = "\n".join(lines).encode(self.encoding)

        if self.compress:
            # Gzip into an in-memory buffer and upload the rewound buffer object.
            stream = BytesIO()
            with gzip.GzipFile(fileobj=stream, mode="wb") as gz:
                gz.write(payload)
            stream.seek(0)
            payload = stream

        self.client.write(target, data=payload, overwrite=self.overwrite)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class LocalFileWriter(BaseFileWriter):
    """Writer that stores text lines in a file on the local filesystem."""

    def write_lines(self, lines: List[str], file_path: str, compress: bool = True, encoding="utf-8"):
        """
        Write each item of ``lines`` followed by a newline to ``file_path``.

        :param lines: text lines to write
        :param file_path: destination path on the local filesystem
        :param compress: when True the file is written through ``gzip.open``
            in text mode, otherwise through the built-in ``open``
        :param encoding: text encoding used for the output file
        """
        if compress:
            sink = gzip.open(file_path, 'wt', encoding=encoding)
        else:
            sink = open(file_path, 'w', encoding=encoding)

        with sink as out:
            out.writelines(f"{line}\n" for line in lines)
|
|
@@ -63,4 +63,8 @@ def list_to_dict(list_data,key_name):
|
|
|
63
63
|
|
|
64
64
|
# 将 defaultdict 转换成普通字典
|
|
65
65
|
dict_data = dict(dict_data)
|
|
66
|
-
return dict_data
|
|
66
|
+
return dict_data
|
|
67
|
+
|
|
68
|
+
def split_list_by_step(lst, step=100):
    """
    Split a flat sequence into consecutive chunks of at most ``step`` items.

    Works on anything that supports ``len()`` and slicing (lists, tuples,
    strings); every chunk has the type produced by slicing ``lst``.

    :param lst: the one-dimensional sequence to split
    :param step: maximum chunk length, must be positive
    :return: list of chunks; empty when ``lst`` is empty
    :raises ValueError: if ``step`` is zero or negative
    """
    # A negative step would make range() empty and silently drop every
    # element; zero makes range() fail with an unrelated-looking message.
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    return [lst[i:i + step] for i in range(0, len(lst), step)]
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
import sys
|
|
3
|
+
import asyncio
|
|
4
|
+
import aiohttp
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from tenacity import retry, stop_after_attempt, wait_random
|
|
8
|
+
|
|
9
|
+
g_headers = {
|
|
10
|
+
'accept': 'application/json',
|
|
11
|
+
'Content-Type': 'application/json',
|
|
12
|
+
'Authorization': 'Bearer eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJ1c2VyX2lkIjotMSwidXNlcl9uYW1lIjoiXHU1ZTk0XHU3NTI4XHU0ZTJkXHU1ZmMzQ2xpZW50In0.'
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
cls._conn = aiohttp.TCPConnector(
|
|
17
|
+
limit=50, # 最大连接数
|
|
18
|
+
ssl=False, # 禁用SSL验证(按需开启)
|
|
19
|
+
force_close=True, # 保持连接活跃
|
|
20
|
+
enable_cleanup_closed=True # 自动清理关闭的连接
|
|
21
|
+
)
|
|
22
|
+
# 由于网络上有重名,没有连接。如果加入域,请转到“控制面板”中的“系统”更改计算机名,然后重试。如果加入工作组,请选择其他工作组名。
|
|
23
|
+
有可能是
|
|
24
|
+
force_close=True, # 保持连接活跃
|
|
25
|
+
enable_cleanup_closed=True # 自动清理关闭的连接
|
|
26
|
+
造成的
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class HttpError(Exception):
    """
    Exception describing a failed HTTP exchange.

    All constructor arguments are keyword-only. ``Exception.__init__`` is not
    called, so ``args`` stays empty; the details live in the attributes below.
    """

    # Class-level defaults, overridden per instance in __init__.
    code = 0
    message = ""
    headers = None

    def __init__(
            self,
            *,
            code: Optional[int] = None,
            message: str = "",
            headers: Optional[dict] = None,
    ) -> None:
        """
        :param code: status code; the class default (0) is kept when None
        :param message: human-readable description of the failure
        :param headers: optional headers associated with the failure
        """
        self.message = message
        self.headers = headers
        # Only shadow the class default when an explicit code was supplied.
        if code is not None:
            self.code = code

    def __str__(self) -> str:
        """Return the short ``code``/``message`` summary."""
        return f"code: {self.code}, message:{self.message}"

    def __repr__(self) -> str:
        """Return a debug representation including the concrete class name."""
        cls_name = type(self).__name__
        return f"<{cls_name}: code={self.code}, message={self.message!r}>"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def on_retry_error(retry_state):
    """
    Convert the final failure of a retried call into an ``HttpError``.

    Used as the ``retry_error_callback`` of the request methods: it runs once
    after the last attempt has failed.

    :param retry_state: retry state object; its ``outcome.exception()`` is the
        exception raised by the last attempt
    :raises HttpError: always, chained to the original exception. ``code`` is
        taken from the original exception when it has one, otherwise 455.
    """
    failure = retry_state.outcome.exception()
    print(f"[HTTP 请求重试所有重试失败.] 错误消息{failure}")

    status = getattr(failure, 'code', 455)
    detail = f"错误消息:{str(failure)}"
    raise HttpError(code=status, message=detail) from failure
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def on_retry(retry_state):
    """
    Print a progress message for an attempt that failed and will be retried.

    Used as the ``before_sleep`` hook of the request methods.

    :param retry_state: retry state object providing ``attempt_number``,
        ``next_action.sleep`` (seconds until the next attempt) and
        ``outcome.exception()`` (the error of the attempt that just failed)
    """
    attempt = retry_state.attempt_number
    delay = retry_state.next_action.sleep
    cause = retry_state.outcome.exception()
    print(
        f"[HTTP 请求重试]"
        f"当前重试 : 第 {attempt} 次"
        f"睡眠时间 : {delay:.2f} 秒"
        f"\n异常原因 : {cause}"
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class ApiNetUtils:
    """
    Asynchronous HTTP helper exposing GET / POST / PATCH as class methods.

    Features:
        1. A single ``aiohttp.ClientSession`` / ``TCPConnector`` pair shared by
           all calls, created lazily on first use and rebuilt when it has been
           closed or when the event loop it was created on is closed.
        2. Every request method is wrapped with tenacity's ``retry``
           (4 attempts in total, random 5-15 s wait between attempts).
        3. Cleanup hooks (``atexit`` and ``sys.excepthook``) are registered
           once, the first time a session is created.

    NOTE(review): no lock protects the lazy initialisation, so this class
    should not be described as thread-safe.
    """

    # Shared resources start as None so nothing is created at import time,
    # when there may be no running event loop yet.
    _conn: Optional[aiohttp.TCPConnector] = None
    _session: Optional[aiohttp.ClientSession] = None
    _close_registered: bool = False  # ensures the cleanup hooks are installed once

    @classmethod
    async def _get_connector(cls) -> aiohttp.TCPConnector:
        """
        Return the shared TCP connector, creating it when needed.

        A new connector is built when none exists, when the current one is
        closed, or when the loop of the connector or of the session is closed.
        """
        stale = (
            cls._conn is None
            or cls._conn.closed
            or cls.is_loop_closed(cls._conn)
            or cls.is_loop_closed(cls._session)
        )
        if stale:
            cls._conn = aiohttp.TCPConnector(
                limit=50,  # maximum number of simultaneous connections
                # NOTE: ssl=False turns off TLS certificate verification.
                ssl=False,
                # Close the underlying socket after every request instead of
                # keeping it alive for reuse.
                force_close=True,
                # enable_cleanup_closed=True,
                # keepalive_timeout=4.99  # slightly below the server's 5 s
            )
        return cls._conn

    @classmethod
    async def _get_session(cls) -> aiohttp.ClientSession:
        """
        Return the shared client session, creating it when needed.

        When the existing session is closed or bound to a closed loop, the old
        resources are released first. The cleanup hooks are registered as part
        of creating a session.
        """
        if cls._session is None or cls._session.closed or cls.is_loop_closed(cls._session):
            if cls._session is not None:
                await cls.close()

            connector = await cls._get_connector()

            # We are inside a coroutine, so this is the loop driving the call.
            loop = asyncio.get_running_loop()

            cls._session = aiohttp.ClientSession(
                connector=connector,
                timeout=aiohttp.ClientTimeout(total=30),  # 30 s total timeout
                loop=loop)

            cls._register_cleanup()

        return cls._session

    @staticmethod
    def is_loop_closed(session: Optional[aiohttp.ClientSession]) -> bool:
        """
        Tell whether the event loop an aiohttp object is bound to is closed.

        :param session: a session (or any object exposing ``_loop``); ``None``
            or an object without a loop is reported as not closed
        :return: True only when a bound loop exists and is closed
        """
        # ``_loop`` is a private aiohttp attribute; read it defensively so a
        # missing session does not raise AttributeError.
        loop = getattr(session, "_loop", None)
        return loop is not None and loop.is_closed()

    @classmethod
    def _register_cleanup(cls):
        """
        Install the process-exit cleanup hooks (at most once).

        Two hooks are installed: an ``atexit`` callback for normal exit and a
        ``sys.excepthook`` wrapper that runs the previous hook and then
        releases the network resources.
        """
        if cls._close_registered:
            return

        # 1. Normal interpreter exit.
        atexit.register(lambda: asyncio.run(cls.close()))

        # 2. Uncaught exception.
        original_excepthook = sys.excepthook

        def custom_excepthook(exctype, value, traceback):
            """Run the previous hook, then release network resources."""
            # Let the original hook report the error (traceback etc.) first.
            original_excepthook(exctype, value, traceback)
            try:
                asyncio.run(cls.close())
            except RuntimeError:
                # asyncio.run() refused to start; retry on a dedicated loop
                # and always close that loop afterwards.
                loop = asyncio.new_event_loop()
                try:
                    loop.run_until_complete(cls.close())
                finally:
                    loop.close()

        sys.excepthook = custom_excepthook
        cls._close_registered = True

    @classmethod
    async def close(cls):
        """
        Close the shared session and connector and reset them to None.

        Called by the registered exit hooks; may also be called manually.
        Calling it when nothing is open is a no-op.
        """
        if cls._session and not cls._session.closed:
            await cls._session.close()
        cls._session = None

        if cls._conn and not cls._conn.closed:
            await cls._conn.close()
        cls._conn = None

    @staticmethod
    async def _json_or_raise(url: str, response):
        """
        Decode a response body as JSON, or raise for a non-200 status.

        :param url: request URL, only used in the error message
        :param response: the aiohttp response object
        :return: the decoded JSON payload
        :raises HttpError: when ``response.status`` is not 200
        """
        if response.status != 200:
            error_text = await response.text()
            raise HttpError(
                code=response.status,
                message=f"请求失败: url={url}, status={response.status}, 错误详情={error_text}"
            )
        return await response.json()

    # -------------------- public API -------------------- #

    @classmethod
    @retry(stop=stop_after_attempt(4),  # 4 attempts in total, i.e. 3 retries
           wait=wait_random(min=5, max=15),
           before_sleep=on_retry,  # runs after each failed attempt
           retry_error_callback=on_retry_error,
           reraise=True)
    async def fetch_get(cls, url: str, headers=None, params=None):
        """
        Send a GET request and return the decoded JSON body.

        :param url: request URL
        :param headers: request headers; the module-level ``g_headers`` is
            used when this is None or empty
        :param params: optional query parameters
        :return: the decoded JSON payload
        :raises HttpError: when the status is not 200; after the last failed
            attempt ``on_retry_error`` raises the final ``HttpError``
        """
        headers = headers or g_headers
        session = await cls._get_session()

        async with session.get(url, headers=headers, params=params) as response:
            return await cls._json_or_raise(url, response)

    @classmethod
    @retry(stop=stop_after_attempt(4),
           wait=wait_random(min=5, max=15),
           before_sleep=on_retry,  # runs after each failed attempt
           retry_error_callback=on_retry_error,
           reraise=True)
    async def fetch_post(cls, url: str, payload: dict, headers=None):
        """
        Send a POST request with a JSON body and return the decoded JSON reply.

        :param url: request URL
        :param payload: object serialised as the JSON request body
        :param headers: request headers; ``g_headers`` when None or empty
        :return: the decoded JSON payload
        :raises HttpError: when the status is not 200
        """
        headers = headers or g_headers
        session = await cls._get_session()

        async with session.post(url, json=payload, headers=headers) as response:
            return await cls._json_or_raise(url, response)

    @classmethod
    @retry(stop=stop_after_attempt(4),
           wait=wait_random(min=5, max=15),
           before_sleep=on_retry,  # runs after each failed attempt
           retry_error_callback=on_retry_error,
           reraise=True)
    async def fetch_patch(cls, url: str, payload: dict, headers=None):
        """
        Send a PATCH request with a JSON body and return the decoded JSON reply.

        :param url: request URL
        :param payload: object serialised as the JSON request body
        :param headers: request headers; ``g_headers`` when None or empty
        :return: the decoded JSON payload
        :raises HttpError: when the status is not 200
        """
        headers = headers or g_headers
        session = await cls._get_session()

        async with session.patch(url, json=payload, headers=headers) as response:
            return await cls._json_or_raise(url, response)

    @classmethod
    async def __aenter__(cls):
        """Support ``async with``: make sure the session exists, return the class."""
        await cls._get_session()
        return cls

    @classmethod
    async def __aexit__(cls, exc_type, exc, tb):
        """Close the shared resources when the ``async with`` block exits."""
        await cls.close()
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
import os
|
|
1
3
|
from contextlib import asynccontextmanager
|
|
2
4
|
from typing import AsyncGenerator, Tuple
|
|
3
5
|
|
|
4
|
-
import
|
|
5
|
-
from aiomysql import Pool, Connection, Cursor, DictCursor
|
|
6
|
+
from aiomysql import Pool, Connection, Cursor
|
|
6
7
|
|
|
7
8
|
DB_CONFIG = {
|
|
8
9
|
'host': '192.168.98.55',
|
|
@@ -36,3 +37,21 @@ async def get_session(pool: Pool) -> AsyncGenerator[Tuple[Connection, Cursor], N
|
|
|
36
37
|
async with pool.acquire() as conn:
|
|
37
38
|
async with conn.cursor() as cursor:
|
|
38
39
|
yield conn, cursor
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# main.py
|
|
43
|
+
import aiomysql
|
|
44
|
+
import asyncio
|
|
45
|
+
|
|
46
|
+
aiomysql_pool = None
|
|
47
|
+
pool_lock = asyncio.Lock() # 全局异步锁
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
async def init_aiomysql_pool_async():
    """
    Return the module-wide aiomysql pool, creating it on first use.

    The pool is built from ``DB_CONFIG``. Creation happens under
    ``pool_lock`` and the ``None`` check is repeated after the lock is
    acquired, so callers that waited on the lock do not create a second pool.

    :return: the shared pool stored in the module global ``aiomysql_pool``
    """
    global aiomysql_pool

    # Fast path: the pool already exists, no locking needed.
    if aiomysql_pool is not None:
        return aiomysql_pool

    async with pool_lock:
        if aiomysql_pool is None:
            print(f"[{os.getpid()}] Initializing aiomysql pool...")
            aiomysql_pool = await aiomysql.create_pool(**DB_CONFIG)
    return aiomysql_pool
|
|
@@ -3,6 +3,12 @@ import re
|
|
|
3
3
|
import unicodedata
|
|
4
4
|
|
|
5
5
|
|
|
6
|
+
def is_ascii_alnum(char: str) -> bool:
    """
    Return True when ``char`` consists only of ASCII letters or digits.

    ``str.isalnum()`` alone also accepts non-English letters such as Chinese
    characters, so the ASCII check is what restricts the result to
    ``A-Z``, ``a-z`` and ``0-9``. An empty string yields False.
    """
    if not char.isascii():
        return False
    return char.isalnum()
|
|
10
|
+
|
|
11
|
+
|
|
6
12
|
def is_all_english_chars(s):
|
|
7
13
|
return bool(re.match(r'^[A-Za-z]+$', s))
|
|
8
14
|
|
|
@@ -104,46 +110,77 @@ def is_all_symbols(text):
|
|
|
104
110
|
|
|
105
111
|
def is_whole_word_en(sub_str: str, long_str: str) -> bool:
    """
    Tell whether ``sub_str`` occurs in ``long_str`` as a complete English word.

    An occurrence counts as a whole word when the character before it and the
    character after it are each either absent (string boundary) or not an
    ASCII letter/digit. The search for ``sub_str`` itself is case-sensitive;
    only the boundary test ignores case.

    :param sub_str: the English substring to look for
    :param long_str: the string to search in
    :return: True if at least one occurrence is a whole word; False when there
        is no such occurrence or when either argument is empty
    """
    if not sub_str or not long_str:
        return False

    # Identical strings are trivially a whole-word match.
    if sub_str == long_str:
        return True

    # Matches a single character that is NOT a letter or digit.
    boundary = re.compile(r"[^a-z0-9]", re.IGNORECASE)
    total = len(long_str)
    width = len(sub_str)

    start = long_str.find(sub_str)
    while start != -1:
        stop = start + width
        left_ok = start == 0 or bool(boundary.match(long_str[start - 1]))
        right_ok = stop == total or bool(boundary.match(long_str[stop]))
        if left_ok and right_ok:
            return True
        # Advance by one so overlapping occurrences are examined too.
        start = long_str.find(sub_str, start + 1)

    return False
|
|
141
160
|
|
|
142
161
|
|
|
143
162
|
def is_whole_word(sub_str: str, long_str: str) -> bool:
    """
    Tell whether ``sub_str`` is a complete word inside ``long_str``.

    Handles both Chinese and English input:

    * ``sub_str`` contains Chinese characters and neither its first nor its
      last character is an ASCII letter/digit: plain containment is enough.
    * Otherwise (pure English, or mixed text that starts or ends with an
      ASCII letter/digit): whole-word matching via ``is_whole_word_en`` so a
      truncated English word is rejected.

    :param sub_str: the substring to look for (Chinese or English)
    :param long_str: the string to search in
    :return: True if ``sub_str`` counts as a whole word, False otherwise
    """
    if contains_chinese_chars(sub_str):
        touches_ascii = is_ascii_alnum(sub_str[0]) or is_ascii_alnum(sub_str[-1])
        if not touches_ascii:
            # Chinese text has no word separators, so containment suffices.
            return sub_str in long_str

    return is_whole_word_en(sub_str, long_str)
|
|
@@ -9,7 +9,7 @@ from re_common.v2.baselibrary.utils.stringutils import (
|
|
|
9
9
|
bj2qj,
|
|
10
10
|
get_diacritic_variant,
|
|
11
11
|
clean_html,
|
|
12
|
-
remove_spaces_between_chinese_characters,
|
|
12
|
+
remove_spaces_between_chinese_characters, clean_unicode_alnum,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
15
|
|
|
@@ -91,6 +91,11 @@ class StringClear(object):
|
|
|
91
91
|
self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
|
|
92
92
|
return self
|
|
93
93
|
|
|
94
|
+
def remove_all_symbols(self):
    """
    Strip every symbol from ``self.obj_str`` using ``clean_unicode_alnum``.

    Only letters and digits of any script are kept.
    Returns ``self`` so calls can be chained.
    """
    cleaned = clean_unicode_alnum(self.obj_str)
    self.obj_str = cleaned
    return self
|
|
98
|
+
|
|
94
99
|
def remove_underline(self):
|
|
95
100
|
# 下划线在 \w 中 所以这里独立封装
|
|
96
101
|
self.obj_str = re.sub("[_]", "", self.obj_str)
|
|
@@ -211,3 +211,18 @@ def get_group_abstract(lists):
|
|
|
211
211
|
t_list.append(keyid_list[text_idx])
|
|
212
212
|
all_list.append(t_list)
|
|
213
213
|
return all_list
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def clean_unicode_alnum(text: str) -> str:
    r"""
    Remove every character that is not a Unicode letter or number.

    The pattern relies on Unicode property classes:
    ``\p{L}`` matches letters of any language and ``\p{N}`` matches any
    Unicode numeric character, including non-Arabic digits.

    This docstring is a raw string on purpose: ``\p`` is not a valid escape
    sequence in a normal string literal.

    :param text: input text
    :return: ``text`` with only Unicode letters and numbers kept
    """
    return regex.sub(r"[^\p{L}\p{N}]+", "", text)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: re_common
|
|
3
|
-
Version: 10.0.
|
|
3
|
+
Version: 10.0.25
|
|
4
4
|
Summary: a library about all python projects
|
|
5
5
|
Home-page: https://gitee.com/xujiangios/re-common
|
|
6
6
|
Author: vic
|
|
@@ -11,14 +11,6 @@ Classifier: Operating System :: OS Independent
|
|
|
11
11
|
Requires-Python: >=3.6
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
|
-
Dynamic: author
|
|
15
|
-
Dynamic: author-email
|
|
16
|
-
Dynamic: classifier
|
|
17
|
-
Dynamic: description
|
|
18
|
-
Dynamic: description-content-type
|
|
19
|
-
Dynamic: home-page
|
|
20
|
-
Dynamic: requires-python
|
|
21
|
-
Dynamic: summary
|
|
22
14
|
|
|
23
15
|
|
|
24
16
|
这是一个基础类,依赖很多的第三方包,是一个用得到的第三方库的封装,可以在此基础上迅速构建项目
|
|
@@ -163,37 +163,46 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
|
|
|
163
163
|
re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
|
|
164
164
|
re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
165
165
|
re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
166
|
+
re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
|
|
167
|
+
re_common/v2/baselibrary/business_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
168
|
+
re_common/v2/baselibrary/business_utils/rel_tools.py,sha256=LfnGFCkUSxg1SHvOMOQdP1PiHxIKqk7Syuk5YYpjJag,295
|
|
166
169
|
re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
167
170
|
re_common/v2/baselibrary/decorators/utils.py,sha256=Q4D6KKCQxvNBXZkPQQn14keKKJpGtg8TUSakjJU40s0,2056
|
|
168
171
|
re_common/v2/baselibrary/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
169
172
|
re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
170
173
|
re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
|
|
171
|
-
re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=
|
|
174
|
+
re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=sKBt2gPsfj0gzV6KaLSAhIhL-j3qNfHfqE-lII1LVwM,3537
|
|
172
175
|
re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
173
176
|
re_common/v2/baselibrary/tools/ac_ahocorasick.py,sha256=c63y5RtKVLD37nyPCnBqfNygwRj4gTQqyIdDOrC65G0,2847
|
|
174
|
-
re_common/v2/baselibrary/tools/dict_tools.py,sha256=
|
|
177
|
+
re_common/v2/baselibrary/tools/dict_tools.py,sha256=eSMwPTLp3oSjuviC_wlXg0I-dnkkmZfUfCRLX5djWV8,1365
|
|
175
178
|
re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
|
|
176
179
|
re_common/v2/baselibrary/tools/hdfs_data_processer.py,sha256=g0DaNjXM1hIUblFQ6YBwnwEBKIXn48X8Y9Eiok4dVlQ,14824
|
|
177
|
-
re_common/v2/baselibrary/tools/list_tools.py,sha256=
|
|
180
|
+
re_common/v2/baselibrary/tools/list_tools.py,sha256=1NxGVM4EytSXh4IGAEfZQnvq0Ev-UOF-PGZBg2EQbOg,2132
|
|
178
181
|
re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=2ENLtZE8opRsfkwRtTNMzITmpTsjO7wZ1ZkfkqpOH9U,1937
|
|
179
182
|
re_common/v2/baselibrary/tools/text_matcher.py,sha256=cPMoFxaA0-ce3tLRxVSs8_3pTYS1oVIHDnNy_AlPU-4,10756
|
|
180
183
|
re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
|
|
184
|
+
re_common/v2/baselibrary/tools/data_processer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
185
|
+
re_common/v2/baselibrary/tools/data_processer/base.py,sha256=i6HA2UQsSRZaKxW1wJMpiC9LAy3wYaI2BVxUAiFoRZ4,1704
|
|
186
|
+
re_common/v2/baselibrary/tools/data_processer/data_processer.py,sha256=R7zHQG8eo3mfckYr-Pp53fyyQj6zd8fuweSxwzvDgN0,22683
|
|
187
|
+
re_common/v2/baselibrary/tools/data_processer/data_reader.py,sha256=LWLbom7W2L0T6q38crA1_Gcvxkzk9Lm0btJjrmtMHMU,7945
|
|
188
|
+
re_common/v2/baselibrary/tools/data_processer/data_writer.py,sha256=OgKZ06zRJYNx758rbjxZG_KNgkLuVLlyB1AvyRsJtS4,1447
|
|
181
189
|
re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
|
|
182
190
|
re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
191
|
+
re_common/v2/baselibrary/utils/api_net_utils.py,sha256=22q3WMWiKVg1IVGr4y2D1JrjhnbQtlChRDJm2S8rGlc,9868
|
|
183
192
|
re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
|
|
184
193
|
re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
|
|
185
194
|
re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
|
|
186
195
|
re_common/v2/baselibrary/utils/basehdfs.py,sha256=TPwFct_-UrmO1KCbo4gpV77rsnlCQDumNBbQKL0ZI9o,5953
|
|
187
196
|
re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
|
|
188
197
|
re_common/v2/baselibrary/utils/basetime.py,sha256=b7U_ho6nE3fjYBxSkdMHXUOd3ClH6KkW_7p7l2Gs4gA,3038
|
|
189
|
-
re_common/v2/baselibrary/utils/db.py,sha256=
|
|
198
|
+
re_common/v2/baselibrary/utils/db.py,sha256=0aVNth5g5KizrxHrXtp1b8Na5oZvof2hL_XVqBYNi6Q,1672
|
|
190
199
|
re_common/v2/baselibrary/utils/json_cls.py,sha256=M93piYtmgm_wP8E57culTrd_AhHLoGg6PqeAJYdW2SM,438
|
|
191
200
|
re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
|
|
192
201
|
re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
|
|
193
|
-
re_common/v2/baselibrary/utils/string_bool.py,sha256=
|
|
194
|
-
re_common/v2/baselibrary/utils/string_clear.py,sha256=
|
|
202
|
+
re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
|
|
203
|
+
re_common/v2/baselibrary/utils/string_clear.py,sha256=ywYR1KrKQyeM-zJgvTmORlfgbLdRSjWWKPe7K8oRx_8,7450
|
|
195
204
|
re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
|
|
196
|
-
re_common/v2/baselibrary/utils/stringutils.py,sha256=
|
|
205
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=TI6fw3km1l25ufXrnG6ha8dSBDtRh-MF4nWRt9u8Xbo,6452
|
|
197
206
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
198
207
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
199
208
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -220,8 +229,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
220
229
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
221
230
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
222
231
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
223
|
-
re_common-10.0.
|
|
224
|
-
re_common-10.0.
|
|
225
|
-
re_common-10.0.
|
|
226
|
-
re_common-10.0.
|
|
227
|
-
re_common-10.0.
|
|
232
|
+
re_common-10.0.25.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
233
|
+
re_common-10.0.25.dist-info/METADATA,sha256=wicJDzxXN8liL_MMiv2eoq5RvkYlsl9wxDulF-BNTJI,582
|
|
234
|
+
re_common-10.0.25.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
235
|
+
re_common-10.0.25.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
236
|
+
re_common-10.0.25.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|