mdbq 3.10.10__py3-none-any.whl → 3.10.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +177 -53
- mdbq/mysql/uploader.py +263 -327
- {mdbq-3.10.10.dist-info → mdbq-3.10.12.dist-info}/METADATA +1 -1
- {mdbq-3.10.10.dist-info → mdbq-3.10.12.dist-info}/RECORD +7 -8
- mdbq/aggregation/optimize.py +0 -475
- {mdbq-3.10.10.dist-info → mdbq-3.10.12.dist-info}/WHEEL +0 -0
- {mdbq-3.10.10.dist-info → mdbq-3.10.12.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,6 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=Y4Z1946i0wgaIJnF5wgkzAlfu4oaGyq8dcx7yqnubVM,19
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/optimize.py,sha256=zC_w_aVYXwmvfF0Z8iSGMmv5vptF0rP-Dz5zmp0gXTU,19820
|
5
4
|
mdbq/aggregation/query_data.py,sha256=fdotW8qdAyDB13p7r3p6AGBkavcHnf6hIvSMtcS7vqE,179875
|
6
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
7
6
|
mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
|
@@ -9,10 +8,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
9
8
|
mdbq/log/mylogger.py,sha256=07sstIeaIQUJXwpMwmxppRI7kW7QwZFnv4Rr3UDlyUs,24133
|
10
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
11
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
12
|
-
mdbq/mysql/deduplicator.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=ibmxpzenhPgT_ei61TjQB2ZxYs9ztkG_ygbLSa8RIlM,32990
|
13
12
|
mdbq/mysql/mysql.py,sha256=Lfy9PsEdgmdRtcG_UUgegH3bFTJPhByTWkcAYl8G6m0,56788
|
14
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=CyNrNTGBvxwfG5NygG1tnsgVTqfKw_U5BewFZvObhJ0,66485
|
16
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
16
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
17
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
26
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
26
|
mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
|
28
|
-
mdbq-3.10.
|
29
|
-
mdbq-3.10.
|
30
|
-
mdbq-3.10.
|
31
|
-
mdbq-3.10.
|
27
|
+
mdbq-3.10.12.dist-info/METADATA,sha256=J0qQ7CRWH5l_7L2Qyu7YzfMLIib-53e9keSzthB6AZ8,365
|
28
|
+
mdbq-3.10.12.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.10.12.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.10.12.dist-info/RECORD,,
|
mdbq/aggregation/optimize.py
DELETED
@@ -1,475 +0,0 @@
|
|
1
|
-
# -*- coding:utf-8 -*-
|
2
|
-
import pymysql
|
3
|
-
import logging
|
4
|
-
from typing import List, Optional, Dict
|
5
|
-
import time
|
6
|
-
import re
|
7
|
-
import os
|
8
|
-
import hashlib
|
9
|
-
from dbutils.pooled_db import PooledDB
|
10
|
-
from mdbq.log import spider_logging
|
11
|
-
from mdbq.config import config
|
12
|
-
import threading
|
13
|
-
import queue
|
14
|
-
|
15
|
-
dir_path = os.path.expanduser("~")
|
16
|
-
config_file = os.path.join(dir_path, 'spd.txt')
|
17
|
-
my_cont = config.read_config(config_file)
|
18
|
-
username, password, port = my_cont['username'], my_cont['password'], my_cont['port']
|
19
|
-
host = '127.0.0.1'
|
20
|
-
logger = spider_logging.setup_logging(reMoveOldHandler=True, filename='optimize.log')
|
21
|
-
|
22
|
-
|
23
|
-
class MySQLDeduplicator:
|
24
|
-
|
25
|
-
def __init__(self, host: str, username: str, password: str, port: int = 3306):
|
26
|
-
self.pool = PooledDB(
|
27
|
-
creator=pymysql,
|
28
|
-
maxconnections=10, # 最大连接数
|
29
|
-
mincached=2, # 初始化空闲连接数
|
30
|
-
maxcached=5, # 空闲连接最大缓存数
|
31
|
-
blocking=True,
|
32
|
-
host=host,
|
33
|
-
port=int(port),
|
34
|
-
user=username,
|
35
|
-
password=password,
|
36
|
-
ping=1,
|
37
|
-
charset='utf8mb4',
|
38
|
-
cursorclass=pymysql.cursors.DictCursor
|
39
|
-
)
|
40
|
-
self.set_typ = {
|
41
|
-
'日期': 'date',
|
42
|
-
'更新时间': 'timestamp',
|
43
|
-
}
|
44
|
-
self.tables_to_reset = queue.Queue() # 线程安全队列
|
45
|
-
self.delay_time = 120 # 延迟重置自增 id
|
46
|
-
self.lock = threading.Lock() # 用于关键操作同步
|
47
|
-
|
48
|
-
def get_table_in_databases(self, db_list=None, reset_id=False):
|
49
|
-
"""
|
50
|
-
reset_id: 是否重置自增 id
|
51
|
-
"""
|
52
|
-
if not db_list:
|
53
|
-
return
|
54
|
-
connection = self.get_connection()
|
55
|
-
res = []
|
56
|
-
for db_name in db_list:
|
57
|
-
try:
|
58
|
-
with connection.cursor() as cursor:
|
59
|
-
cursor.execute(f"USE `{db_name}`")
|
60
|
-
cursor.execute("SHOW TABLES")
|
61
|
-
tables = cursor.fetchall()
|
62
|
-
for index, item in enumerate(tables):
|
63
|
-
res.append(
|
64
|
-
{
|
65
|
-
'db_name': db_name,
|
66
|
-
'table_name': item.get(f'Tables_in_{db_name}', ''),
|
67
|
-
'reset_id': reset_id,
|
68
|
-
}
|
69
|
-
)
|
70
|
-
except:
|
71
|
-
pass
|
72
|
-
connection.close()
|
73
|
-
return res
|
74
|
-
|
75
|
-
def deduplicate(
|
76
|
-
self,
|
77
|
-
tables_list: List[Dict],
|
78
|
-
order_column: str = "更新时间",
|
79
|
-
order_direction: str = "DESC",
|
80
|
-
batch_size: int = 10000,
|
81
|
-
id_column: str = "id",
|
82
|
-
recent_months: Optional[int] = None
|
83
|
-
) -> bool:
|
84
|
-
"""
|
85
|
-
执行多表去重操作
|
86
|
-
:param tables_list: 目标表配置列表,每个元素为字典,包含db_name, table_name, unique_keys(可选), reset_id(可选)
|
87
|
-
:param order_column: 排序字段
|
88
|
-
:param order_direction: 排序方向 (ASC/DESC)
|
89
|
-
:param batch_size: 批量删除批次大小
|
90
|
-
:param id_column: 自增列名称
|
91
|
-
:return: 是否全部成功
|
92
|
-
"""
|
93
|
-
if recent_months is not None and (not isinstance(recent_months, int) or recent_months < 1):
|
94
|
-
logger.error("recent_months必须为None或正整数")
|
95
|
-
return False
|
96
|
-
for table_config in tables_list:
|
97
|
-
config = {
|
98
|
-
'order_column': order_column,
|
99
|
-
'order_direction': order_direction,
|
100
|
-
'batch_size': batch_size,
|
101
|
-
'id_column': id_column,
|
102
|
-
'reset_id': table_config.get('reset_id', False), # 处理默认值
|
103
|
-
'unique_keys': table_config.get('unique_keys', None),
|
104
|
-
'recent_months': recent_months,
|
105
|
-
}
|
106
|
-
config.update(table_config)
|
107
|
-
self._deduplicate_single_table(**config)
|
108
|
-
|
109
|
-
def _deduplicate_single_table(
|
110
|
-
self,
|
111
|
-
db_name: str,
|
112
|
-
table_name: str,
|
113
|
-
unique_keys: Optional[List[str]],
|
114
|
-
order_column: str,
|
115
|
-
order_direction: str,
|
116
|
-
batch_size: int,
|
117
|
-
reset_id: bool,
|
118
|
-
id_column: str,
|
119
|
-
recent_months: Optional[int] = None
|
120
|
-
):
|
121
|
-
"""单表去重逻辑"""
|
122
|
-
|
123
|
-
# 获取数据库连接并检查有效性
|
124
|
-
connection = self.get_connection(db_name=db_name)
|
125
|
-
if not connection:
|
126
|
-
logger.error(f"连接数据库失败: {db_name}")
|
127
|
-
return False
|
128
|
-
|
129
|
-
temp_suffix = hashlib.md5(f"{table_name}{time.time()}".encode()).hexdigest()[:8]
|
130
|
-
temp_table = f"temp_{temp_suffix}"
|
131
|
-
|
132
|
-
try:
|
133
|
-
# 版本检查在check_db内部
|
134
|
-
if not self.check_db(db_name, table_name):
|
135
|
-
return False
|
136
|
-
|
137
|
-
with connection.cursor() as cursor:
|
138
|
-
# 主键重复检查
|
139
|
-
try:
|
140
|
-
cursor.execute(f"""
|
141
|
-
SELECT COUNT(*) AS total,
|
142
|
-
COUNT(DISTINCT `{id_column}`) AS distinct_count
|
143
|
-
FROM `{table_name}`
|
144
|
-
""")
|
145
|
-
except pymysql.err.InternalError as e:
|
146
|
-
if e.args[0] == pymysql.constants.ER.BAD_FIELD_ERROR:
|
147
|
-
logger.warning(f"{db_name}/{table_name} 跳过主键检查(无{id_column}列)")
|
148
|
-
else:
|
149
|
-
raise
|
150
|
-
else:
|
151
|
-
res = cursor.fetchone()
|
152
|
-
if res['total'] != res['distinct_count']:
|
153
|
-
logger.error(f"{db_name}/{table_name} 主键重复: {id_column}")
|
154
|
-
return False
|
155
|
-
|
156
|
-
all_columns = self._get_table_columns(db_name, table_name)
|
157
|
-
# 自动生成unique_keys逻辑
|
158
|
-
if not unique_keys:
|
159
|
-
exclude_set = {id_column.lower(), order_column.lower()}
|
160
|
-
|
161
|
-
if not all_columns:
|
162
|
-
logger.error(f"{db_name}/{table_name} 无法获取表列信息")
|
163
|
-
return False
|
164
|
-
|
165
|
-
# 排除id_column和order_column
|
166
|
-
unique_keys = [
|
167
|
-
col for col in all_columns
|
168
|
-
if col.lower() not in exclude_set
|
169
|
-
and col != id_column # 额外确保大小写兼容
|
170
|
-
and col != order_column
|
171
|
-
]
|
172
|
-
# 检查剩余列是否有效
|
173
|
-
if not unique_keys:
|
174
|
-
unique_keys = all_columns
|
175
|
-
logger.warning(f"{db_name}/{table_name} 使用全列作为唯一键: {all_columns}")
|
176
|
-
return False
|
177
|
-
# logger.info(f"自动生成unique_keys: {unique_keys}")
|
178
|
-
else:
|
179
|
-
if not self._validate_columns(db_name, table_name, unique_keys):
|
180
|
-
logger.error(f"{db_name}/{table_name} unique_keys中存在无效列名")
|
181
|
-
return False
|
182
|
-
|
183
|
-
# 动态生成临时表名
|
184
|
-
partition_clause = ', '.join([f'`{col}`' for col in unique_keys])
|
185
|
-
|
186
|
-
# 使用参数化查询创建临时表
|
187
|
-
if self._validate_columns(db_name, table_name, [order_column]):
|
188
|
-
order_clause = f"ORDER BY `{order_column}` {order_direction}" if order_column else ""
|
189
|
-
else:
|
190
|
-
order_clause = ''
|
191
|
-
|
192
|
-
# 时间过滤
|
193
|
-
where_clause = ""
|
194
|
-
query_params = []
|
195
|
-
date_column_exists = '日期' in all_columns
|
196
|
-
if recent_months and recent_months > 0 and date_column_exists:
|
197
|
-
where_clause = "WHERE `日期` >= DATE_SUB(CURDATE(), INTERVAL %s MONTH)"
|
198
|
-
query_params.append(recent_months)
|
199
|
-
elif recent_months and not date_column_exists:
|
200
|
-
logger.warning(f"{db_name}/{table_name} 忽略recent_months参数(无日期列)")
|
201
|
-
|
202
|
-
create_temp_sql = f"""
|
203
|
-
CREATE TEMPORARY TABLE `{temp_table}` AS
|
204
|
-
SELECT tmp_id FROM (
|
205
|
-
SELECT `{id_column}` AS tmp_id,
|
206
|
-
ROW_NUMBER() OVER (
|
207
|
-
PARTITION BY {partition_clause or '1'}
|
208
|
-
{order_clause}
|
209
|
-
) AS row_num
|
210
|
-
FROM `{table_name}`
|
211
|
-
{where_clause}
|
212
|
-
) t WHERE row_num > 1;
|
213
|
-
"""
|
214
|
-
cursor.execute(create_temp_sql, query_params)
|
215
|
-
|
216
|
-
logger.info(f'{db_name}/{table_name} 执行排重任务')
|
217
|
-
# 批量删除优化
|
218
|
-
iteration = 0
|
219
|
-
total_deleted = 0
|
220
|
-
while True and iteration < 10000:
|
221
|
-
iteration += 1
|
222
|
-
# 获取并删除临时表中的数据,避免重复处理
|
223
|
-
cursor.execute(f"""
|
224
|
-
SELECT tmp_id
|
225
|
-
FROM `{temp_table}`
|
226
|
-
LIMIT %s
|
227
|
-
FOR UPDATE;
|
228
|
-
""", (batch_size,))
|
229
|
-
batch = cursor.fetchall()
|
230
|
-
if not batch:
|
231
|
-
break
|
232
|
-
ids = [str(row['tmp_id']) for row in batch]
|
233
|
-
placeholder = ','.join(['%s'] * len(ids))
|
234
|
-
|
235
|
-
if ids:
|
236
|
-
try:
|
237
|
-
# 删除主表数据
|
238
|
-
cursor.execute(f"DELETE FROM `{table_name}` WHERE `{id_column}` IN ({placeholder})", ids)
|
239
|
-
|
240
|
-
# 删除临时表中已处理的记录
|
241
|
-
cursor.execute(f"DELETE FROM `{temp_table}` WHERE tmp_id IN ({placeholder})", ids)
|
242
|
-
except pymysql.err.InternalError as e:
|
243
|
-
if e.args[0] == pymysql.constants.ER.BAD_FIELD_ERROR:
|
244
|
-
logger.error(f"{db_name}/{table_name} 无法通过 {id_column} 删除记录,请检查列存在性")
|
245
|
-
return False
|
246
|
-
raise
|
247
|
-
|
248
|
-
total_deleted += cursor.rowcount
|
249
|
-
connection.commit()
|
250
|
-
logger.info(f"{db_name}/{table_name} 执行去重, 删除记录数: {total_deleted}")
|
251
|
-
|
252
|
-
if total_deleted > 0:
|
253
|
-
logger.info(f"{db_name}/{table_name} 删除记录数总计: {total_deleted}")
|
254
|
-
|
255
|
-
# 线程安全操作队列
|
256
|
-
if reset_id:
|
257
|
-
if not self._validate_columns(db_name, table_name, [id_column]):
|
258
|
-
return True
|
259
|
-
|
260
|
-
with self.lock:
|
261
|
-
self.tables_to_reset.put((db_name, table_name, id_column))
|
262
|
-
logger.info(f"{db_name}/{table_name} -> {self.delay_time}秒后重置自增id")
|
263
|
-
threading.Timer(self.delay_time, self.delayed_reset_auto_increment).start()
|
264
|
-
|
265
|
-
return True
|
266
|
-
except Exception as e:
|
267
|
-
logger.error(f"{db_name}/{table_name} 去重操作异常: {e}", exc_info=True)
|
268
|
-
connection.rollback()
|
269
|
-
return False
|
270
|
-
finally:
|
271
|
-
with connection.cursor() as cursor:
|
272
|
-
cursor.execute(f"DROP TEMPORARY TABLE IF EXISTS `{temp_table}`")
|
273
|
-
connection.close()
|
274
|
-
|
275
|
-
def _get_table_columns(self, db_name: str, table_name: str) -> List[str]:
|
276
|
-
"""获取表的列"""
|
277
|
-
try:
|
278
|
-
connection = self.get_connection(db_name=db_name)
|
279
|
-
with connection.cursor() as cursor:
|
280
|
-
cursor.execute(f"SHOW COLUMNS FROM `{table_name}`")
|
281
|
-
return [row["Field"] for row in cursor.fetchall()]
|
282
|
-
except pymysql.Error as e:
|
283
|
-
logging.error(f"{db_name}/{table_name} 获取列失败: {e}")
|
284
|
-
return []
|
285
|
-
|
286
|
-
def check_db(self, db_name: str, table_name: str) -> bool:
|
287
|
-
"""数据库检查"""
|
288
|
-
try:
|
289
|
-
with self.get_connection() as conn:
|
290
|
-
with conn.cursor() as cursor:
|
291
|
-
# 获取MySQL版本
|
292
|
-
version = self._check_mysql_version(cursor)
|
293
|
-
collation = 'utf8mb4_0900_ai_ci' if version >= 8.0 else 'utf8mb4_general_ci'
|
294
|
-
|
295
|
-
# 创建数据库
|
296
|
-
cursor.execute(f"""
|
297
|
-
CREATE DATABASE IF NOT EXISTS `{db_name}`
|
298
|
-
CHARACTER SET utf8mb4 COLLATE {collation}
|
299
|
-
""")
|
300
|
-
conn.commit()
|
301
|
-
|
302
|
-
# 切换数据库
|
303
|
-
cursor.execute(f"USE `{db_name}`")
|
304
|
-
|
305
|
-
# 检查表是否存在
|
306
|
-
if not self._table_exists(cursor, table_name):
|
307
|
-
self._create_table(cursor, table_name)
|
308
|
-
conn.commit()
|
309
|
-
return True
|
310
|
-
except Exception as e:
|
311
|
-
logger.error(f"{db_name}/{table_name} 数据库检查失败: {e}")
|
312
|
-
return False
|
313
|
-
|
314
|
-
def get_connection(self, db_name=None):
|
315
|
-
"""从连接池获取连接"""
|
316
|
-
for _ in range(10):
|
317
|
-
try:
|
318
|
-
if db_name:
|
319
|
-
connection = self.pool.connection()
|
320
|
-
with connection.cursor() as cursor:
|
321
|
-
cursor.execute(f'use {db_name};')
|
322
|
-
return connection
|
323
|
-
|
324
|
-
return self.pool.connection()
|
325
|
-
except pymysql.Error as e:
|
326
|
-
logger.error(f"{db_name} 获取连接失败: {e}, 30秒后重试...")
|
327
|
-
time.sleep(30)
|
328
|
-
logger.error(f"{host}: {port} 数据库连接失败,已达最大重试次数")
|
329
|
-
return None
|
330
|
-
|
331
|
-
def _validate_identifier(self, name: str) -> bool:
|
332
|
-
"""更严格的对象名验证(符合MySQL规范)"""
|
333
|
-
return re.match(r'^[\w$]+$', name) and len(name) <= 64
|
334
|
-
|
335
|
-
def _validate_columns(self, db_name: str, table_name: str, columns: List[str]) -> bool:
|
336
|
-
"""验证列是否存在"""
|
337
|
-
if not all(self._validate_identifier(col) for col in columns):
|
338
|
-
return False
|
339
|
-
try:
|
340
|
-
connection = self.get_connection(db_name=db_name)
|
341
|
-
with connection.cursor() as cursor:
|
342
|
-
cursor.execute(f"SHOW COLUMNS FROM `{table_name}`")
|
343
|
-
existing_columns = {col['Field'] for col in cursor.fetchall()}
|
344
|
-
return all(col in existing_columns for col in columns)
|
345
|
-
except pymysql.Error as e:
|
346
|
-
logging.error(f"{db_name}/{table_name} 列验证失败: {e}")
|
347
|
-
return False
|
348
|
-
|
349
|
-
def _check_mysql_version(self, cursor) -> float:
|
350
|
-
"""通过传入游标检查版本"""
|
351
|
-
cursor.execute("SELECT VERSION()")
|
352
|
-
return float(cursor.fetchone()['VERSION()'][:3])
|
353
|
-
|
354
|
-
def _table_exists(self, cursor, table_name: str) -> bool:
|
355
|
-
cursor.execute("SHOW TABLES LIKE %s", (table_name,))
|
356
|
-
return cursor.fetchone() is not None
|
357
|
-
|
358
|
-
def _create_table(self, cursor, table_name: str):
|
359
|
-
"""安全建表逻辑"""
|
360
|
-
columns = ["`id` INT AUTO_INCREMENT PRIMARY KEY"]
|
361
|
-
for cn, ct in self.set_typ.items():
|
362
|
-
col_def = f"`{cn}` {ct.upper()} NOT NULL DEFAULT "
|
363
|
-
if 'INT' in ct:
|
364
|
-
col_def += '0'
|
365
|
-
elif 'TIMESTAMP' in ct:
|
366
|
-
col_def += 'CURRENT_TIMESTAMP'
|
367
|
-
else:
|
368
|
-
col_def += "''"
|
369
|
-
columns.append(col_def)
|
370
|
-
cursor.execute(f"""
|
371
|
-
CREATE TABLE `{table_name}` (
|
372
|
-
{', '.join(columns)}
|
373
|
-
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
|
374
|
-
""")
|
375
|
-
|
376
|
-
def delayed_reset_auto_increment(self):
|
377
|
-
"""线程安全的自增ID重置"""
|
378
|
-
while not self.tables_to_reset.empty():
|
379
|
-
try:
|
380
|
-
item = self.tables_to_reset.get_nowait()
|
381
|
-
self._safe_reset_auto_increment(*item)
|
382
|
-
except queue.Empty:
|
383
|
-
break
|
384
|
-
|
385
|
-
def _safe_reset_auto_increment(self, db_name: str, table_name: str, id_column: str):
|
386
|
-
"""安全重置自增ID"""
|
387
|
-
with self.get_connection(db_name) as conn:
|
388
|
-
try:
|
389
|
-
with conn.cursor() as cursor:
|
390
|
-
cursor.execute("START TRANSACTION")
|
391
|
-
temp_table = f"reset_{hashlib.md5(table_name.encode()).hexdigest()[:8]}"
|
392
|
-
backup_table = f"{table_name}_backup_{int(time.time())}"
|
393
|
-
cursor.execute(f"CREATE TABLE `{temp_table}` LIKE `{table_name}`")
|
394
|
-
cursor.execute(f"ALTER TABLE `{temp_table}` MODIFY COLUMN `{id_column}` INT NOT NULL")
|
395
|
-
columns = self._get_table_columns(db_name, table_name)
|
396
|
-
if id_column not in columns:
|
397
|
-
logger.error(f"列 {id_column} 不存在于表 {table_name}")
|
398
|
-
return False
|
399
|
-
columns.remove(id_column)
|
400
|
-
columns_str = ', '.join([f'`{col}`' for col in columns])
|
401
|
-
insert_sql = f"""
|
402
|
-
INSERT INTO `{temp_table}` (`{id_column}`, {columns_str})
|
403
|
-
SELECT ROW_NUMBER() OVER (ORDER BY `{id_column}`), {columns_str}
|
404
|
-
FROM `{table_name}` ORDER BY `{id_column}`
|
405
|
-
"""
|
406
|
-
cursor.execute(insert_sql)
|
407
|
-
cursor.execute(f"RENAME TABLE `{table_name}` TO `{backup_table}`, `{temp_table}` TO `{table_name}`")
|
408
|
-
cursor.execute(f"ALTER TABLE `{table_name}` MODIFY COLUMN `{id_column}` INT AUTO_INCREMENT")
|
409
|
-
cursor.execute(f"SELECT MAX(`{id_column}`) + 1 AS next_id FROM `{table_name}`")
|
410
|
-
next_id = cursor.fetchone()['next_id'] or 1
|
411
|
-
cursor.execute(f"ALTER TABLE `{table_name}` AUTO_INCREMENT = {next_id}")
|
412
|
-
cursor.execute(f"DROP TABLE IF EXISTS `{backup_table}`")
|
413
|
-
cursor.execute(f"DROP TEMPORARY TABLE IF EXISTS `{temp_table}`")
|
414
|
-
cursor.execute("COMMIT")
|
415
|
-
logger.info(f'{db_name}/{table_name} 已重置自增id')
|
416
|
-
except Exception as e:
|
417
|
-
logger.error(f"{db_name}/{table_name} 重置自增id失败: {e}")
|
418
|
-
cursor.execute("ROLLBACK")
|
419
|
-
return False
|
420
|
-
finally:
|
421
|
-
conn.close()
|
422
|
-
|
423
|
-
|
424
|
-
def main():
|
425
|
-
op = MySQLDeduplicator(
|
426
|
-
host=host,
|
427
|
-
username=username,
|
428
|
-
password=password,
|
429
|
-
port=port
|
430
|
-
)
|
431
|
-
op.delay_time = 600
|
432
|
-
# tables_list = [
|
433
|
-
# {
|
434
|
-
# 'db_name': "测试库",
|
435
|
-
# 'table_name': "测试库2",
|
436
|
-
# 'reset_id': True, # 可选, 默认 False
|
437
|
-
# # 'unique_keys': ["日期", "店铺名称", "商品id"]
|
438
|
-
# }
|
439
|
-
# ]
|
440
|
-
db_list = [
|
441
|
-
"京东数据3",
|
442
|
-
"属性设置3",
|
443
|
-
"推广数据2",
|
444
|
-
"推广数据_圣积天猫店",
|
445
|
-
"推广数据_淘宝店",
|
446
|
-
"推广数据_奥莱店",
|
447
|
-
"爱库存2",
|
448
|
-
"生意参谋3",
|
449
|
-
"生意经3",
|
450
|
-
"达摩盘3",
|
451
|
-
'人群画像2',
|
452
|
-
'商品人群画像2',
|
453
|
-
'市场数据3',
|
454
|
-
# '数据银行2'
|
455
|
-
# '回传数据',
|
456
|
-
# '大模型库',
|
457
|
-
'安全组',
|
458
|
-
# '视频数据',
|
459
|
-
# '聚合数据',
|
460
|
-
'数据引擎2'
|
461
|
-
]
|
462
|
-
tables_list = op.get_table_in_databases(db_list=db_list, reset_id=False)
|
463
|
-
op.deduplicate(
|
464
|
-
order_column = "更新时间",
|
465
|
-
order_direction = "DESC",
|
466
|
-
batch_size = 1000,
|
467
|
-
id_column = "id",
|
468
|
-
tables_list=tables_list,
|
469
|
-
recent_months=3,
|
470
|
-
)
|
471
|
-
logger.info(f'全部任务完成')
|
472
|
-
|
473
|
-
|
474
|
-
if __name__ == "__main__":
|
475
|
-
main()
|
File without changes
|
File without changes
|