mdbq 3.10.8__py3-none-any.whl → 3.10.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/optimize.py +1 -0
- mdbq/aggregation/query_data.py +2 -0
- mdbq/mysql/deduplicator.py +171 -157
- mdbq/mysql/mysql.py +336 -280
- mdbq/mysql/s_query.py +159 -143
- mdbq/redis/getredis.py +0 -2
- {mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/METADATA +1 -1
- {mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/RECORD +11 -11
- {mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/WHEEL +0 -0
- {mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.10.8'
|
1
|
+
VERSION = '3.10.10'
|
mdbq/aggregation/optimize.py
CHANGED
mdbq/aggregation/query_data.py
CHANGED
@@ -3995,6 +3995,7 @@ def main(days=150, months=3):
|
|
3995
3995
|
"推广数据2",
|
3996
3996
|
"推广数据_淘宝店",
|
3997
3997
|
"推广数据_奥莱店",
|
3998
|
+
"推广数据_圣积天猫店",
|
3998
3999
|
"爱库存2",
|
3999
4000
|
"生意参谋3",
|
4000
4001
|
"生意经3",
|
@@ -4003,6 +4004,7 @@ def main(days=150, months=3):
|
|
4003
4004
|
'商品人群画像2',
|
4004
4005
|
'市场数据3',
|
4005
4006
|
'回传数据',
|
4007
|
+
'数据引擎2',
|
4006
4008
|
]
|
4007
4009
|
# 使用 ThreadPoolExecutor 来并行运行
|
4008
4010
|
# with concurrent.futures.ThreadPoolExecutor() as executor:
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# -*- coding:utf-8 -*-
|
2
|
-
import datetime
|
3
2
|
import re
|
4
3
|
import time
|
5
4
|
from functools import wraps
|
@@ -7,11 +6,12 @@ import warnings
|
|
7
6
|
import pymysql
|
8
7
|
import os
|
9
8
|
from mdbq.log import mylogger
|
10
|
-
from typing import List, Dict, Optional, Any, Tuple
|
9
|
+
from typing import List, Dict, Optional, Any, Tuple
|
11
10
|
from dbutils.pooled_db import PooledDB
|
12
11
|
import threading
|
13
12
|
import concurrent.futures
|
14
13
|
from collections import defaultdict
|
14
|
+
import sys
|
15
15
|
|
16
16
|
|
17
17
|
warnings.filterwarnings('ignore')
|
@@ -24,7 +24,7 @@ logger = mylogger.MyLogger(
|
|
24
24
|
max_log_size=50,
|
25
25
|
backup_count=5,
|
26
26
|
enable_async=False, # 是否启用异步日志
|
27
|
-
sample_rate=1, # 采样
|
27
|
+
sample_rate=1, # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
|
28
28
|
sensitive_fields=[], # 敏感字段列表
|
29
29
|
)
|
30
30
|
|
@@ -72,26 +72,28 @@ class MySQLDeduplicator:
|
|
72
72
|
skip_system_dbs: bool = True,
|
73
73
|
max_retries: int = 3,
|
74
74
|
retry_interval: int = 5,
|
75
|
-
pool_size: int = 5
|
76
|
-
|
75
|
+
pool_size: int = 5,
|
76
|
+
primary_key: str = 'id',
|
77
|
+
date_range: Optional[List[str]] = None,
|
78
|
+
recent_month: Optional[int] = None,
|
79
|
+
date_column: str = '日期',
|
80
|
+
exclude_columns: Optional[List[str]] = None
|
81
|
+
) -> None:
|
77
82
|
"""
|
78
83
|
初始化去重处理器
|
79
|
-
|
80
|
-
:param
|
81
|
-
:param
|
82
|
-
:param
|
83
|
-
:param
|
84
|
-
:param charset: 字符集,默认为utf8mb4
|
85
|
-
:param max_workers: 最大工作线程数,默认为1(单线程)
|
86
|
-
:param batch_size: 批量处理大小,默认为1000
|
87
|
-
:param skip_system_dbs: 是否跳过系统数据库,默认为True
|
88
|
-
:param max_retries: 最大重试次数
|
89
|
-
:param retry_interval: 重试间隔(秒)
|
90
|
-
:param pool_size: 连接池大小
|
84
|
+
新增参数:
|
85
|
+
:param date_range: 指定去重的日期区间 [start_date, end_date],格式'YYYY-MM-DD'
|
86
|
+
:param recent_month: 最近N个月的数据去重(与date_range互斥,优先生效)
|
87
|
+
:param date_column: 时间列名,默认为'日期'
|
88
|
+
:param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
|
91
89
|
"""
|
92
90
|
# 连接池状态标志
|
93
91
|
self._closed = False
|
94
|
-
|
92
|
+
logger.debug('初始化MySQLDeduplicator', {
|
93
|
+
'host': host, 'port': port, 'user': username, 'charset': charset,
|
94
|
+
'max_workers': max_workers, 'batch_size': batch_size, 'pool_size': pool_size,
|
95
|
+
'exclude_columns': exclude_columns
|
96
|
+
})
|
95
97
|
# 初始化连接池
|
96
98
|
self.pool = PooledDB(
|
97
99
|
creator=pymysql,
|
@@ -110,6 +112,34 @@ class MySQLDeduplicator:
|
|
110
112
|
self.skip_system_dbs = skip_system_dbs
|
111
113
|
self.max_retries = max_retries
|
112
114
|
self.retry_interval = retry_interval
|
115
|
+
self.primary_key = primary_key
|
116
|
+
|
117
|
+
# 时间范围参数
|
118
|
+
self.date_range = date_range
|
119
|
+
self.recent_month = recent_month
|
120
|
+
self.date_column = date_column
|
121
|
+
self._dedup_start_date = None
|
122
|
+
self._dedup_end_date = None
|
123
|
+
# 不管 exclude_columns 是否传入, 'id' 一定会被排除
|
124
|
+
default_exclude = {'id'}
|
125
|
+
# exclude_columns 不传则排除: ['id', '更新时间']
|
126
|
+
if not exclude_columns:
|
127
|
+
self.exclude_columns = list(default_exclude | {'更新时间'})
|
128
|
+
else:
|
129
|
+
self.exclude_columns = list(set(exclude_columns) | default_exclude)
|
130
|
+
# 解析时间范围
|
131
|
+
if self.date_range and len(self.date_range) == 2:
|
132
|
+
self._dedup_start_date, self._dedup_end_date = self.date_range
|
133
|
+
elif self.recent_month:
|
134
|
+
from datetime import datetime, timedelta
|
135
|
+
today = datetime.today()
|
136
|
+
month = today.month - self.recent_month
|
137
|
+
year = today.year
|
138
|
+
while month <= 0:
|
139
|
+
month += 12
|
140
|
+
year -= 1
|
141
|
+
self._dedup_start_date = f"{year}-{month:02d}-01"
|
142
|
+
self._dedup_end_date = today.strftime("%Y-%m-%d")
|
113
143
|
|
114
144
|
# 线程安全控制
|
115
145
|
self._lock = threading.Lock()
|
@@ -118,27 +148,28 @@ class MySQLDeduplicator:
|
|
118
148
|
# 系统数据库列表
|
119
149
|
self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
|
120
150
|
|
121
|
-
def _get_connection(self):
|
151
|
+
def _get_connection(self) -> pymysql.connections.Connection:
|
122
152
|
"""从连接池获取连接"""
|
123
153
|
if self._closed:
|
154
|
+
logger.error('尝试获取连接但连接池已关闭')
|
124
155
|
raise ConnectionError("连接池已关闭")
|
125
156
|
try:
|
126
157
|
conn = self.pool.connection()
|
127
158
|
logger.debug("成功获取数据库连接")
|
128
159
|
return conn
|
129
160
|
except Exception as e:
|
130
|
-
logger.error(f"获取数据库连接失败: {str(e)}")
|
161
|
+
logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
|
131
162
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
132
163
|
|
133
164
|
@staticmethod
|
134
|
-
def _retry_on_failure(func):
|
165
|
+
def _retry_on_failure(func: Any) -> Any:
|
135
166
|
"""重试装饰器"""
|
136
|
-
|
137
167
|
@wraps(func)
|
138
168
|
def wrapper(self, *args, **kwargs):
|
139
169
|
last_exception = None
|
140
170
|
for attempt in range(self.max_retries + 1):
|
141
171
|
try:
|
172
|
+
logger.debug(f'调用{func.__name__},第{attempt+1}次尝试', {'args': args, 'kwargs': kwargs})
|
142
173
|
return func(self, *args, **kwargs)
|
143
174
|
except (pymysql.OperationalError, pymysql.InterfaceError) as e:
|
144
175
|
last_exception = e
|
@@ -146,18 +177,17 @@ class MySQLDeduplicator:
|
|
146
177
|
wait_time = self.retry_interval * (attempt + 1)
|
147
178
|
logger.warning(
|
148
179
|
f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
|
149
|
-
{'error': str(e), 'wait_time': wait_time})
|
180
|
+
{'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
|
150
181
|
time.sleep(wait_time)
|
151
182
|
continue
|
152
183
|
except Exception as e:
|
153
184
|
last_exception = e
|
154
|
-
logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
|
185
|
+
logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__, 'func': func.__name__})
|
155
186
|
break
|
156
|
-
|
157
187
|
if last_exception:
|
188
|
+
logger.error('重试后依然失败', {'func': func.__name__, 'last_exception': str(last_exception)})
|
158
189
|
raise last_exception
|
159
190
|
raise Exception("未知错误")
|
160
|
-
|
161
191
|
return wrapper
|
162
192
|
|
163
193
|
@_retry_on_failure
|
@@ -187,7 +217,7 @@ class MySQLDeduplicator:
|
|
187
217
|
|
188
218
|
@_retry_on_failure
|
189
219
|
def _get_table_columns(self, database: str, table: str) -> List[str]:
|
190
|
-
"""获取表的列名(
|
220
|
+
"""获取表的列名(排除主键列)"""
|
191
221
|
sql = """
|
192
222
|
SELECT COLUMN_NAME
|
193
223
|
FROM INFORMATION_SCHEMA.COLUMNS
|
@@ -199,7 +229,7 @@ class MySQLDeduplicator:
|
|
199
229
|
with conn.cursor() as cursor:
|
200
230
|
cursor.execute(sql, (database, table))
|
201
231
|
return [row['COLUMN_NAME'] for row in cursor.fetchall()
|
202
|
-
if row['COLUMN_NAME'].lower() !=
|
232
|
+
if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
|
203
233
|
|
204
234
|
def _acquire_table_lock(self, database: str, table: str) -> bool:
|
205
235
|
"""获取表处理锁,防止并发处理同一张表"""
|
@@ -212,7 +242,7 @@ class MySQLDeduplicator:
|
|
212
242
|
self._processing_tables.add(key)
|
213
243
|
return True
|
214
244
|
|
215
|
-
def _release_table_lock(self, database: str, table: str):
|
245
|
+
def _release_table_lock(self, database: str, table: str) -> None:
|
216
246
|
"""释放表处理锁"""
|
217
247
|
key = f"{database}.{table}"
|
218
248
|
|
@@ -238,100 +268,111 @@ class MySQLDeduplicator:
|
|
238
268
|
"""
|
239
269
|
if not self._acquire_table_lock(database, table):
|
240
270
|
return (0, 0)
|
241
|
-
|
271
|
+
temp_table = None
|
242
272
|
try:
|
243
|
-
|
244
|
-
|
273
|
+
# 获取原始数据总量
|
274
|
+
with self._get_connection() as conn:
|
275
|
+
with conn.cursor() as cursor:
|
276
|
+
logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
|
277
|
+
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
|
278
|
+
total_count_row = cursor.fetchone()
|
279
|
+
total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
|
280
|
+
logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
|
245
281
|
# 获取实际列名
|
246
282
|
all_columns = self._get_table_columns(database, table)
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
283
|
+
logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
|
284
|
+
# 检查是否需要按时间范围过滤
|
285
|
+
use_time_filter = False
|
286
|
+
time_col = self.date_column
|
287
|
+
all_columns_lower = [col.lower() for col in all_columns]
|
288
|
+
# 排除exclude_columns
|
289
|
+
exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
|
290
|
+
# 统一列名小写做判断
|
252
291
|
use_columns = columns or all_columns
|
253
|
-
|
254
|
-
|
292
|
+
use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
|
293
|
+
invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
|
255
294
|
if invalid_columns:
|
256
|
-
logger.warning(
|
257
|
-
f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
|
258
|
-
{'invalid_columns': invalid_columns}
|
259
|
-
)
|
260
|
-
use_columns = [col for col in use_columns if col in all_columns]
|
261
|
-
|
295
|
+
logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
|
262
296
|
if not use_columns:
|
263
|
-
logger.error(
|
297
|
+
logger.error('没有有效的去重列', {"库": database, "表": table})
|
264
298
|
return (0, 0)
|
265
|
-
|
266
|
-
# 构建去重SQL
|
299
|
+
# 统一用反引号包裹
|
267
300
|
column_list = ', '.join([f'`{col}`' for col in use_columns])
|
268
|
-
|
269
|
-
temp_table =
|
270
|
-
|
271
|
-
|
272
|
-
|
301
|
+
temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
|
302
|
+
temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
|
303
|
+
pk = self.primary_key
|
304
|
+
# 主键判断也用小写
|
305
|
+
if pk.lower() not in all_columns_lower and pk != 'id':
|
306
|
+
logger.error('', {"不存在主键列": database, "表": table, "主键列不存在": pk})
|
307
|
+
return (0, 0)
|
308
|
+
# 找到实际主键名
|
309
|
+
pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
|
310
|
+
# 构造where条件
|
311
|
+
where_time = ''
|
312
|
+
if use_time_filter:
|
313
|
+
where_time = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'"
|
273
314
|
create_temp_sql = f"""
|
274
315
|
CREATE TABLE `{database}`.`{temp_table}` AS
|
275
|
-
SELECT MIN(`
|
316
|
+
SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
|
276
317
|
FROM `{database}`.`{table}`
|
318
|
+
{where_time}
|
277
319
|
GROUP BY {column_list}
|
278
320
|
HAVING COUNT(*) > 1
|
279
321
|
"""
|
280
|
-
|
281
|
-
delete_dup_sql = f"""
|
282
|
-
DELETE FROM `{database}`.`{table}`
|
283
|
-
WHERE `id` NOT IN (
|
284
|
-
SELECT `min_id` FROM `{database}`.`{temp_table}`
|
285
|
-
) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
|
286
|
-
"""
|
287
|
-
|
288
322
|
drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
|
289
|
-
|
290
323
|
with self._get_connection() as conn:
|
291
324
|
with conn.cursor() as cursor:
|
292
|
-
|
325
|
+
logger.debug('创建临时表SQL', {'sql': create_temp_sql})
|
293
326
|
cursor.execute(create_temp_sql)
|
327
|
+
logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
|
294
328
|
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
|
295
|
-
|
296
|
-
|
329
|
+
dup_count_row = cursor.fetchone()
|
330
|
+
dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
|
297
331
|
if dup_count == 0:
|
298
|
-
logger.info(
|
332
|
+
logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
|
333
|
+
logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
|
299
334
|
cursor.execute(drop_temp_sql)
|
300
335
|
conn.commit()
|
301
336
|
return (0, 0)
|
302
|
-
|
303
|
-
logger.info(
|
304
|
-
f"表 {database}.{table} 发现 {dup_count} 组重复数据",
|
305
|
-
{'columns': use_columns}
|
306
|
-
)
|
307
|
-
|
337
|
+
affected_rows = 0
|
308
338
|
if not dry_run:
|
309
|
-
#
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
{'
|
316
|
-
|
339
|
+
# 分批删除,避免锁表
|
340
|
+
while True:
|
341
|
+
delete_dup_sql = f"""
|
342
|
+
DELETE FROM `{database}`.`{table}`
|
343
|
+
WHERE `{pk_real}` NOT IN (
|
344
|
+
SELECT `min_id` FROM `{database}`.`{temp_table}`
|
345
|
+
) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
|
346
|
+
AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
|
347
|
+
LIMIT {self.batch_size}
|
348
|
+
"""
|
349
|
+
logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
|
350
|
+
cursor.execute(delete_dup_sql)
|
351
|
+
batch_deleted = cursor.rowcount
|
352
|
+
affected_rows += batch_deleted
|
353
|
+
conn.commit()
|
354
|
+
if batch_deleted < self.batch_size:
|
355
|
+
break
|
356
|
+
logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
|
317
357
|
else:
|
358
|
+
logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
|
318
359
|
affected_rows = 0
|
319
|
-
|
320
|
-
f"[模拟运行] 表 {database}.{table} 将删除 {dup_count} 组重复数据",
|
321
|
-
{'columns': use_columns}
|
322
|
-
)
|
323
|
-
|
324
|
-
# 清理临时表
|
360
|
+
logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
|
325
361
|
cursor.execute(drop_temp_sql)
|
326
362
|
conn.commit()
|
327
|
-
|
328
363
|
return (dup_count, affected_rows)
|
329
|
-
|
330
364
|
except Exception as e:
|
331
|
-
logger.error(
|
332
|
-
|
333
|
-
|
334
|
-
|
365
|
+
logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
|
366
|
+
# 异常时也要清理临时表
|
367
|
+
if temp_table:
|
368
|
+
try:
|
369
|
+
with self._get_connection() as conn:
|
370
|
+
with conn.cursor() as cursor:
|
371
|
+
drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
|
372
|
+
cursor.execute(drop_temp_sql)
|
373
|
+
conn.commit()
|
374
|
+
except Exception as drop_e:
|
375
|
+
logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
335
376
|
return (0, 0)
|
336
377
|
finally:
|
337
378
|
self._release_table_lock(database, table)
|
@@ -353,17 +394,15 @@ class MySQLDeduplicator:
|
|
353
394
|
:return: (重复行数, 删除行数)
|
354
395
|
"""
|
355
396
|
try:
|
356
|
-
# 检查表是否存在
|
357
397
|
if not self._check_table_exists(database, table):
|
358
|
-
logger.warning(
|
398
|
+
logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
|
359
399
|
return (0, 0)
|
360
|
-
|
361
|
-
|
400
|
+
logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
|
401
|
+
result = self._deduplicate_table(database, table, columns, dry_run)
|
402
|
+
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
|
403
|
+
return result
|
362
404
|
except Exception as e:
|
363
|
-
logger.error(
|
364
|
-
f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
|
365
|
-
{'error_type': type(e).__name__}
|
366
|
-
)
|
405
|
+
logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
|
367
406
|
return (0, 0)
|
368
407
|
|
369
408
|
def deduplicate_database(
|
@@ -385,49 +424,40 @@ class MySQLDeduplicator:
|
|
385
424
|
:return: 字典 {表名: (重复行数, 删除行数)}
|
386
425
|
"""
|
387
426
|
results = {}
|
388
|
-
|
389
427
|
try:
|
390
|
-
# 检查数据库是否存在
|
391
428
|
if not self._check_database_exists(database):
|
392
|
-
logger.warning(
|
429
|
+
logger.warning('数据库不存在', {"库": database})
|
393
430
|
return results
|
394
|
-
|
395
|
-
# 获取要处理的表
|
396
431
|
target_tables = tables or self._get_tables(database)
|
432
|
+
logger.debug('获取目标表', {'库': database, 'tables': target_tables})
|
397
433
|
if not target_tables:
|
398
|
-
logger.info(
|
434
|
+
logger.info('数据库中没有表', {"库": database, "操作": "跳过"})
|
399
435
|
return results
|
400
|
-
|
401
|
-
logger.info(
|
402
|
-
f"开始处理数据库 {database} 中的 {len(target_tables)} 张表",
|
403
|
-
{'tables': target_tables}
|
404
|
-
)
|
405
|
-
|
436
|
+
logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
|
406
437
|
if parallel and self.max_workers > 1:
|
407
|
-
|
438
|
+
logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
|
439
|
+
# 使用线程池并行处理
|
408
440
|
with concurrent.futures.ThreadPoolExecutor(
|
409
441
|
max_workers=self.max_workers
|
410
442
|
) as executor:
|
411
443
|
futures = {}
|
412
444
|
for table in target_tables:
|
413
445
|
columns = columns_map.get(table) if columns_map else None
|
446
|
+
logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
|
414
447
|
futures[executor.submit(
|
415
448
|
self.deduplicate_table,
|
416
449
|
database, table, columns, dry_run
|
417
450
|
)] = table
|
418
|
-
|
419
451
|
for future in concurrent.futures.as_completed(futures):
|
420
452
|
table = futures[future]
|
421
453
|
try:
|
422
454
|
dup_count, affected_rows = future.result()
|
423
455
|
results[table] = (dup_count, affected_rows)
|
424
456
|
except Exception as e:
|
425
|
-
logger.error(
|
426
|
-
f"处理表 {database}.{table} 时出错: {str(e)}",
|
427
|
-
{'error_type': type(e).__name__}
|
428
|
-
)
|
457
|
+
logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
|
429
458
|
results[table] = (0, 0)
|
430
459
|
else:
|
460
|
+
logger.debug('串行处理表', {'库': database})
|
431
461
|
# 串行处理
|
432
462
|
for table in target_tables:
|
433
463
|
columns = columns_map.get(table) if columns_map else None
|
@@ -435,20 +465,12 @@ class MySQLDeduplicator:
|
|
435
465
|
database, table, columns, dry_run
|
436
466
|
)
|
437
467
|
results[table] = (dup_count, affected_rows)
|
438
|
-
|
439
|
-
# 统计结果
|
440
468
|
total_dup = sum(r[0] for r in results.values())
|
441
469
|
total_del = sum(r[1] for r in results.values())
|
442
|
-
|
443
|
-
logger.info(
|
444
|
-
f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
|
445
|
-
{'results': results}
|
446
|
-
)
|
447
|
-
|
470
|
+
logger.info('单库完成', {"库": database, "重复组数": total_dup, "总删除行数": total_del, "详细结果": results})
|
448
471
|
return results
|
449
|
-
|
450
472
|
except Exception as e:
|
451
|
-
logger.error(
|
473
|
+
logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
|
452
474
|
return results
|
453
475
|
|
454
476
|
def deduplicate_all(
|
@@ -470,18 +492,15 @@ class MySQLDeduplicator:
|
|
470
492
|
:return: 嵌套字典 {数据库名: {表名: (重复行数, 删除行数)}}
|
471
493
|
"""
|
472
494
|
all_results = defaultdict(dict)
|
473
|
-
|
474
495
|
try:
|
475
|
-
# 获取要处理的数据库
|
476
496
|
target_dbs = databases or self._get_databases()
|
497
|
+
logger.debug('获取目标数据库', {'databases': target_dbs})
|
477
498
|
if not target_dbs:
|
478
|
-
logger.warning(
|
499
|
+
logger.warning('没有可处理的数据库')
|
479
500
|
return all_results
|
480
|
-
|
481
|
-
logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
|
482
|
-
|
501
|
+
logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
|
483
502
|
if parallel and self.max_workers > 1:
|
484
|
-
#
|
503
|
+
# 使用线程池并行处理多个数据库
|
485
504
|
with concurrent.futures.ThreadPoolExecutor(
|
486
505
|
max_workers=self.max_workers
|
487
506
|
) as executor:
|
@@ -493,14 +512,13 @@ class MySQLDeduplicator:
|
|
493
512
|
self.deduplicate_database,
|
494
513
|
db, tables, db_columns_map, dry_run, False
|
495
514
|
)] = db
|
496
|
-
|
497
515
|
for future in concurrent.futures.as_completed(futures):
|
498
516
|
db = futures[future]
|
499
517
|
try:
|
500
518
|
db_results = future.result()
|
501
519
|
all_results[db] = db_results
|
502
520
|
except Exception as e:
|
503
|
-
logger.error(
|
521
|
+
logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
|
504
522
|
all_results[db] = {}
|
505
523
|
else:
|
506
524
|
# 串行处理数据库
|
@@ -511,8 +529,6 @@ class MySQLDeduplicator:
|
|
511
529
|
db, tables, db_columns_map, dry_run, parallel
|
512
530
|
)
|
513
531
|
all_results[db] = db_results
|
514
|
-
|
515
|
-
# 统计总体结果
|
516
532
|
total_dup = sum(
|
517
533
|
r[0] for db in all_results.values()
|
518
534
|
for r in db.values()
|
@@ -521,16 +537,10 @@ class MySQLDeduplicator:
|
|
521
537
|
r[1] for db in all_results.values()
|
522
538
|
for r in db.values()
|
523
539
|
)
|
524
|
-
|
525
|
-
logger.info(
|
526
|
-
f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
|
527
|
-
{'total_results': all_results}
|
528
|
-
)
|
529
|
-
|
540
|
+
logger.info('全局完成', {"总重复组数": total_dup, "总删除行数": total_del, "详细结果": dict(all_results)})
|
530
541
|
return all_results
|
531
|
-
|
532
542
|
except Exception as e:
|
533
|
-
logger.error(
|
543
|
+
logger.error('异常', {"error": str(e), 'traceback': repr(e)})
|
534
544
|
return all_results
|
535
545
|
|
536
546
|
@_retry_on_failure
|
@@ -557,42 +567,46 @@ class MySQLDeduplicator:
|
|
557
567
|
cursor.execute(sql, (database, table))
|
558
568
|
return bool(cursor.fetchone())
|
559
569
|
|
560
|
-
def close(self):
|
570
|
+
def close(self) -> None:
|
561
571
|
"""关闭连接池"""
|
562
572
|
try:
|
563
573
|
if hasattr(self, 'pool') and self.pool and not self._closed:
|
564
574
|
self.pool.close()
|
565
575
|
self._closed = True
|
566
576
|
logger.info("数据库连接池已关闭")
|
577
|
+
else:
|
578
|
+
logger.info('连接池已关闭或不存在')
|
567
579
|
except Exception as e:
|
568
|
-
logger.error(f"
|
580
|
+
logger.error(f"关闭连接池时出错", {'error_type': type(e).__name__, 'error': str(e)})
|
569
581
|
|
570
|
-
def __enter__(self):
|
582
|
+
def __enter__(self) -> 'MySQLDeduplicator':
|
571
583
|
return self
|
572
584
|
|
573
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
585
|
+
def __exit__(self, exc_type: Optional[type], exc_val: Optional[BaseException], exc_tb: Optional[Any]) -> None:
|
574
586
|
self.close()
|
575
587
|
|
576
588
|
|
577
589
|
def main():
|
578
590
|
deduplicator = MySQLDeduplicator(
|
579
591
|
username='root',
|
580
|
-
password='
|
592
|
+
password='pwd',
|
581
593
|
host='localhost',
|
582
594
|
port=3306
|
583
595
|
)
|
584
596
|
|
585
597
|
# 全库去重(单线程)
|
586
|
-
deduplicator.deduplicate_all()
|
598
|
+
deduplicator.deduplicate_all(dry_run=False, parallel=False)
|
587
599
|
|
588
600
|
# # 指定数据库去重(多线程)
|
589
|
-
#
|
601
|
+
# logger.info('调用deduplicate_database')
|
602
|
+
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
|
590
603
|
|
591
604
|
# # 指定表去重(使用特定列)
|
592
|
-
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
|
605
|
+
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
|
593
606
|
|
594
607
|
# 关闭连接
|
595
608
|
deduplicator.close()
|
596
609
|
|
597
610
|
if __name__ == '__main__':
|
598
|
-
main()
|
611
|
+
# main()
|
612
|
+
pass
|