mdbq 3.10.9__py3-none-any.whl → 3.10.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +151 -145
- {mdbq-3.10.9.dist-info → mdbq-3.10.10.dist-info}/METADATA +1 -1
- {mdbq-3.10.9.dist-info → mdbq-3.10.10.dist-info}/RECORD +6 -6
- {mdbq-3.10.9.dist-info → mdbq-3.10.10.dist-info}/WHEEL +0 -0
- {mdbq-3.10.9.dist-info → mdbq-3.10.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.10.9'
|
1
|
+
VERSION = '3.10.10'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# -*- coding:utf-8 -*-
|
2
|
-
import datetime
|
3
2
|
import re
|
4
3
|
import time
|
5
4
|
from functools import wraps
|
@@ -7,11 +6,12 @@ import warnings
|
|
7
6
|
import pymysql
|
8
7
|
import os
|
9
8
|
from mdbq.log import mylogger
|
10
|
-
from typing import List, Dict, Optional, Any, Tuple
|
9
|
+
from typing import List, Dict, Optional, Any, Tuple
|
11
10
|
from dbutils.pooled_db import PooledDB
|
12
11
|
import threading
|
13
12
|
import concurrent.futures
|
14
13
|
from collections import defaultdict
|
14
|
+
import sys
|
15
15
|
|
16
16
|
|
17
17
|
warnings.filterwarnings('ignore')
|
@@ -73,27 +73,27 @@ class MySQLDeduplicator:
|
|
73
73
|
max_retries: int = 3,
|
74
74
|
retry_interval: int = 5,
|
75
75
|
pool_size: int = 5,
|
76
|
-
primary_key: str = 'id'
|
76
|
+
primary_key: str = 'id',
|
77
|
+
date_range: Optional[List[str]] = None,
|
78
|
+
recent_month: Optional[int] = None,
|
79
|
+
date_column: str = '日期',
|
80
|
+
exclude_columns: Optional[List[str]] = None
|
77
81
|
) -> None:
|
78
82
|
"""
|
79
83
|
初始化去重处理器
|
80
|
-
|
81
|
-
:param
|
82
|
-
:param
|
83
|
-
:param
|
84
|
-
:param
|
85
|
-
:param charset: 字符集,默认为utf8mb4
|
86
|
-
:param max_workers: 最大工作线程数,默认为1(单线程)
|
87
|
-
:param batch_size: 批量处理大小,默认为1000
|
88
|
-
:param skip_system_dbs: 是否跳过系统数据库,默认为True
|
89
|
-
:param max_retries: 最大重试次数
|
90
|
-
:param retry_interval: 重试间隔(秒)
|
91
|
-
:param pool_size: 连接池大小
|
92
|
-
:param primary_key: 主键列名,默认为'id'
|
84
|
+
新增参数:
|
85
|
+
:param date_range: 指定去重的日期区间 [start_date, end_date],格式'YYYY-MM-DD'
|
86
|
+
:param recent_month: 最近N个月的数据去重(与date_range互斥,优先生效)
|
87
|
+
:param date_column: 时间列名,默认为'日期'
|
88
|
+
:param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
|
93
89
|
"""
|
94
90
|
# 连接池状态标志
|
95
91
|
self._closed = False
|
96
|
-
|
92
|
+
logger.debug('初始化MySQLDeduplicator', {
|
93
|
+
'host': host, 'port': port, 'user': username, 'charset': charset,
|
94
|
+
'max_workers': max_workers, 'batch_size': batch_size, 'pool_size': pool_size,
|
95
|
+
'exclude_columns': exclude_columns
|
96
|
+
})
|
97
97
|
# 初始化连接池
|
98
98
|
self.pool = PooledDB(
|
99
99
|
creator=pymysql,
|
@@ -114,6 +114,33 @@ class MySQLDeduplicator:
|
|
114
114
|
self.retry_interval = retry_interval
|
115
115
|
self.primary_key = primary_key
|
116
116
|
|
117
|
+
# 时间范围参数
|
118
|
+
self.date_range = date_range
|
119
|
+
self.recent_month = recent_month
|
120
|
+
self.date_column = date_column
|
121
|
+
self._dedup_start_date = None
|
122
|
+
self._dedup_end_date = None
|
123
|
+
# 不管 exclude_columns 是否传入, 'id' 一定会被排除
|
124
|
+
default_exclude = {'id'}
|
125
|
+
# exclude_columns 不传则排除: ['id', '更新时间']
|
126
|
+
if not exclude_columns:
|
127
|
+
self.exclude_columns = list(default_exclude | {'更新时间'})
|
128
|
+
else:
|
129
|
+
self.exclude_columns = list(set(exclude_columns) | default_exclude)
|
130
|
+
# 解析时间范围
|
131
|
+
if self.date_range and len(self.date_range) == 2:
|
132
|
+
self._dedup_start_date, self._dedup_end_date = self.date_range
|
133
|
+
elif self.recent_month:
|
134
|
+
from datetime import datetime, timedelta
|
135
|
+
today = datetime.today()
|
136
|
+
month = today.month - self.recent_month
|
137
|
+
year = today.year
|
138
|
+
while month <= 0:
|
139
|
+
month += 12
|
140
|
+
year -= 1
|
141
|
+
self._dedup_start_date = f"{year}-{month:02d}-01"
|
142
|
+
self._dedup_end_date = today.strftime("%Y-%m-%d")
|
143
|
+
|
117
144
|
# 线程安全控制
|
118
145
|
self._lock = threading.Lock()
|
119
146
|
self._processing_tables = set() # 正在处理的表集合
|
@@ -124,24 +151,25 @@ class MySQLDeduplicator:
|
|
124
151
|
def _get_connection(self) -> pymysql.connections.Connection:
|
125
152
|
"""从连接池获取连接"""
|
126
153
|
if self._closed:
|
154
|
+
logger.error('尝试获取连接但连接池已关闭')
|
127
155
|
raise ConnectionError("连接池已关闭")
|
128
156
|
try:
|
129
157
|
conn = self.pool.connection()
|
130
158
|
logger.debug("成功获取数据库连接")
|
131
159
|
return conn
|
132
160
|
except Exception as e:
|
133
|
-
logger.error(f"获取数据库连接失败: {str(e)}")
|
161
|
+
logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
|
134
162
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
135
163
|
|
136
164
|
@staticmethod
|
137
165
|
def _retry_on_failure(func: Any) -> Any:
|
138
166
|
"""重试装饰器"""
|
139
|
-
|
140
167
|
@wraps(func)
|
141
168
|
def wrapper(self, *args, **kwargs):
|
142
169
|
last_exception = None
|
143
170
|
for attempt in range(self.max_retries + 1):
|
144
171
|
try:
|
172
|
+
logger.debug(f'调用{func.__name__},第{attempt+1}次尝试', {'args': args, 'kwargs': kwargs})
|
145
173
|
return func(self, *args, **kwargs)
|
146
174
|
except (pymysql.OperationalError, pymysql.InterfaceError) as e:
|
147
175
|
last_exception = e
|
@@ -149,18 +177,17 @@ class MySQLDeduplicator:
|
|
149
177
|
wait_time = self.retry_interval * (attempt + 1)
|
150
178
|
logger.warning(
|
151
179
|
f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
|
152
|
-
{'error': str(e), 'wait_time': wait_time})
|
180
|
+
{'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
|
153
181
|
time.sleep(wait_time)
|
154
182
|
continue
|
155
183
|
except Exception as e:
|
156
184
|
last_exception = e
|
157
|
-
logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
|
185
|
+
logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__, 'func': func.__name__})
|
158
186
|
break
|
159
|
-
|
160
187
|
if last_exception:
|
188
|
+
logger.error('重试后依然失败', {'func': func.__name__, 'last_exception': str(last_exception)})
|
161
189
|
raise last_exception
|
162
190
|
raise Exception("未知错误")
|
163
|
-
|
164
191
|
return wrapper
|
165
192
|
|
166
193
|
@_retry_on_failure
|
@@ -241,104 +268,111 @@ class MySQLDeduplicator:
|
|
241
268
|
"""
|
242
269
|
if not self._acquire_table_lock(database, table):
|
243
270
|
return (0, 0)
|
244
|
-
|
271
|
+
temp_table = None
|
245
272
|
try:
|
246
|
-
|
247
|
-
|
273
|
+
# 获取原始数据总量
|
274
|
+
with self._get_connection() as conn:
|
275
|
+
with conn.cursor() as cursor:
|
276
|
+
logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
|
277
|
+
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
|
278
|
+
total_count_row = cursor.fetchone()
|
279
|
+
total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
|
280
|
+
logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
|
248
281
|
# 获取实际列名
|
249
282
|
all_columns = self._get_table_columns(database, table)
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
283
|
+
logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
|
284
|
+
# 检查是否需要按时间范围过滤
|
285
|
+
use_time_filter = False
|
286
|
+
time_col = self.date_column
|
287
|
+
all_columns_lower = [col.lower() for col in all_columns]
|
288
|
+
# 排除exclude_columns
|
289
|
+
exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
|
290
|
+
# 统一列名小写做判断
|
255
291
|
use_columns = columns or all_columns
|
256
|
-
|
257
|
-
|
292
|
+
use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
|
293
|
+
invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
|
258
294
|
if invalid_columns:
|
259
|
-
logger.warning(
|
260
|
-
f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
|
261
|
-
{'invalid_columns': list(invalid_columns)}
|
262
|
-
)
|
263
|
-
use_columns = [col for col in use_columns if col in all_columns]
|
264
|
-
|
295
|
+
logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
|
265
296
|
if not use_columns:
|
266
|
-
logger.error(
|
297
|
+
logger.error('没有有效的去重列', {"库": database, "表": table})
|
267
298
|
return (0, 0)
|
268
|
-
|
269
|
-
# 构建去重SQL
|
299
|
+
# 统一用反引号包裹
|
270
300
|
column_list = ', '.join([f'`{col}`' for col in use_columns])
|
271
|
-
|
272
|
-
temp_table = f"temp_{table}_dedup_{os.getpid()}"
|
301
|
+
temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
|
273
302
|
temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
|
274
303
|
pk = self.primary_key
|
275
|
-
#
|
276
|
-
if pk not in
|
277
|
-
logger.error(
|
304
|
+
# 主键判断也用小写
|
305
|
+
if pk.lower() not in all_columns_lower and pk != 'id':
|
306
|
+
logger.error('', {"不存在主键列": database, "表": table, "主键列不存在": pk})
|
278
307
|
return (0, 0)
|
308
|
+
# 找到实际主键名
|
309
|
+
pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
|
310
|
+
# 构造where条件
|
311
|
+
where_time = ''
|
312
|
+
if use_time_filter:
|
313
|
+
where_time = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'"
|
279
314
|
create_temp_sql = f"""
|
280
315
|
CREATE TABLE `{database}`.`{temp_table}` AS
|
281
|
-
SELECT MIN(`{
|
316
|
+
SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
|
282
317
|
FROM `{database}`.`{table}`
|
318
|
+
{where_time}
|
283
319
|
GROUP BY {column_list}
|
284
320
|
HAVING COUNT(*) > 1
|
285
321
|
"""
|
286
|
-
|
287
|
-
delete_dup_sql = f"""
|
288
|
-
DELETE FROM `{database}`.`{table}`
|
289
|
-
WHERE `{pk}` NOT IN (
|
290
|
-
SELECT `min_id` FROM `{database}`.`{temp_table}`
|
291
|
-
) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
|
292
|
-
"""
|
293
|
-
|
294
322
|
drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
|
295
|
-
|
296
323
|
with self._get_connection() as conn:
|
297
324
|
with conn.cursor() as cursor:
|
298
|
-
|
325
|
+
logger.debug('创建临时表SQL', {'sql': create_temp_sql})
|
299
326
|
cursor.execute(create_temp_sql)
|
327
|
+
logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
|
300
328
|
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
|
301
329
|
dup_count_row = cursor.fetchone()
|
302
330
|
dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
|
303
|
-
|
304
331
|
if dup_count == 0:
|
305
|
-
logger.info(
|
332
|
+
logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
|
333
|
+
logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
|
306
334
|
cursor.execute(drop_temp_sql)
|
307
335
|
conn.commit()
|
308
336
|
return (0, 0)
|
309
|
-
|
310
|
-
logger.info(
|
311
|
-
f"表 {database}.{table} 发现 {dup_count} 组重复数据",
|
312
|
-
{'columns': use_columns}
|
313
|
-
)
|
314
|
-
|
337
|
+
affected_rows = 0
|
315
338
|
if not dry_run:
|
316
|
-
#
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
{'
|
323
|
-
|
339
|
+
# 分批删除,避免锁表
|
340
|
+
while True:
|
341
|
+
delete_dup_sql = f"""
|
342
|
+
DELETE FROM `{database}`.`{table}`
|
343
|
+
WHERE `{pk_real}` NOT IN (
|
344
|
+
SELECT `min_id` FROM `{database}`.`{temp_table}`
|
345
|
+
) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
|
346
|
+
AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
|
347
|
+
LIMIT {self.batch_size}
|
348
|
+
"""
|
349
|
+
logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
|
350
|
+
cursor.execute(delete_dup_sql)
|
351
|
+
batch_deleted = cursor.rowcount
|
352
|
+
affected_rows += batch_deleted
|
353
|
+
conn.commit()
|
354
|
+
if batch_deleted < self.batch_size:
|
355
|
+
break
|
356
|
+
logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
|
324
357
|
else:
|
358
|
+
logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
|
325
359
|
affected_rows = 0
|
326
|
-
|
327
|
-
f"[模拟运行] 表 {database}.{table} 将删除 {dup_count} 组重复数据",
|
328
|
-
{'columns': use_columns}
|
329
|
-
)
|
330
|
-
|
331
|
-
# 清理临时表
|
360
|
+
logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
|
332
361
|
cursor.execute(drop_temp_sql)
|
333
362
|
conn.commit()
|
334
|
-
|
335
363
|
return (dup_count, affected_rows)
|
336
|
-
|
337
364
|
except Exception as e:
|
338
|
-
logger.error(
|
339
|
-
|
340
|
-
|
341
|
-
|
365
|
+
logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
|
366
|
+
# 异常时也要清理临时表
|
367
|
+
if temp_table:
|
368
|
+
try:
|
369
|
+
with self._get_connection() as conn:
|
370
|
+
with conn.cursor() as cursor:
|
371
|
+
drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
|
372
|
+
cursor.execute(drop_temp_sql)
|
373
|
+
conn.commit()
|
374
|
+
except Exception as drop_e:
|
375
|
+
logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
342
376
|
return (0, 0)
|
343
377
|
finally:
|
344
378
|
self._release_table_lock(database, table)
|
@@ -360,17 +394,15 @@ class MySQLDeduplicator:
|
|
360
394
|
:return: (重复行数, 删除行数)
|
361
395
|
"""
|
362
396
|
try:
|
363
|
-
# 检查表是否存在
|
364
397
|
if not self._check_table_exists(database, table):
|
365
|
-
logger.warning(
|
398
|
+
logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
|
366
399
|
return (0, 0)
|
367
|
-
|
368
|
-
|
400
|
+
logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
|
401
|
+
result = self._deduplicate_table(database, table, columns, dry_run)
|
402
|
+
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
|
403
|
+
return result
|
369
404
|
except Exception as e:
|
370
|
-
logger.error(
|
371
|
-
f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
|
372
|
-
{'error_type': type(e).__name__}
|
373
|
-
)
|
405
|
+
logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
|
374
406
|
return (0, 0)
|
375
407
|
|
376
408
|
def deduplicate_database(
|
@@ -392,49 +424,40 @@ class MySQLDeduplicator:
|
|
392
424
|
:return: 字典 {表名: (重复行数, 删除行数)}
|
393
425
|
"""
|
394
426
|
results = {}
|
395
|
-
|
396
427
|
try:
|
397
|
-
# 检查数据库是否存在
|
398
428
|
if not self._check_database_exists(database):
|
399
|
-
logger.warning(
|
429
|
+
logger.warning('数据库不存在', {"库": database})
|
400
430
|
return results
|
401
|
-
|
402
|
-
# 获取要处理的表
|
403
431
|
target_tables = tables or self._get_tables(database)
|
432
|
+
logger.debug('获取目标表', {'库': database, 'tables': target_tables})
|
404
433
|
if not target_tables:
|
405
|
-
logger.info(
|
434
|
+
logger.info('数据库中没有表', {"库": database, "操作": "跳过"})
|
406
435
|
return results
|
407
|
-
|
408
|
-
logger.info(
|
409
|
-
f"开始处理数据库 {database} 中的 {len(target_tables)} 张表",
|
410
|
-
{'tables': target_tables}
|
411
|
-
)
|
412
|
-
|
436
|
+
logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
|
413
437
|
if parallel and self.max_workers > 1:
|
414
|
-
|
438
|
+
logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
|
439
|
+
# 使用线程池并行处理
|
415
440
|
with concurrent.futures.ThreadPoolExecutor(
|
416
441
|
max_workers=self.max_workers
|
417
442
|
) as executor:
|
418
443
|
futures = {}
|
419
444
|
for table in target_tables:
|
420
445
|
columns = columns_map.get(table) if columns_map else None
|
446
|
+
logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
|
421
447
|
futures[executor.submit(
|
422
448
|
self.deduplicate_table,
|
423
449
|
database, table, columns, dry_run
|
424
450
|
)] = table
|
425
|
-
|
426
451
|
for future in concurrent.futures.as_completed(futures):
|
427
452
|
table = futures[future]
|
428
453
|
try:
|
429
454
|
dup_count, affected_rows = future.result()
|
430
455
|
results[table] = (dup_count, affected_rows)
|
431
456
|
except Exception as e:
|
432
|
-
logger.error(
|
433
|
-
f"处理表 {database}.{table} 时出错: {str(e)}",
|
434
|
-
{'error_type': type(e).__name__}
|
435
|
-
)
|
457
|
+
logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
|
436
458
|
results[table] = (0, 0)
|
437
459
|
else:
|
460
|
+
logger.debug('串行处理表', {'库': database})
|
438
461
|
# 串行处理
|
439
462
|
for table in target_tables:
|
440
463
|
columns = columns_map.get(table) if columns_map else None
|
@@ -442,20 +465,12 @@ class MySQLDeduplicator:
|
|
442
465
|
database, table, columns, dry_run
|
443
466
|
)
|
444
467
|
results[table] = (dup_count, affected_rows)
|
445
|
-
|
446
|
-
# 统计结果
|
447
468
|
total_dup = sum(r[0] for r in results.values())
|
448
469
|
total_del = sum(r[1] for r in results.values())
|
449
|
-
|
450
|
-
logger.info(
|
451
|
-
f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
|
452
|
-
{'results': results}
|
453
|
-
)
|
454
|
-
|
470
|
+
logger.info('单库完成', {"库": database, "重复组数": total_dup, "总删除行数": total_del, "详细结果": results})
|
455
471
|
return results
|
456
|
-
|
457
472
|
except Exception as e:
|
458
|
-
logger.error(
|
473
|
+
logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
|
459
474
|
return results
|
460
475
|
|
461
476
|
def deduplicate_all(
|
@@ -477,18 +492,15 @@ class MySQLDeduplicator:
|
|
477
492
|
:return: 嵌套字典 {数据库名: {表名: (重复行数, 删除行数)}}
|
478
493
|
"""
|
479
494
|
all_results = defaultdict(dict)
|
480
|
-
|
481
495
|
try:
|
482
|
-
# 获取要处理的数据库
|
483
496
|
target_dbs = databases or self._get_databases()
|
497
|
+
logger.debug('获取目标数据库', {'databases': target_dbs})
|
484
498
|
if not target_dbs:
|
485
|
-
logger.warning(
|
499
|
+
logger.warning('没有可处理的数据库')
|
486
500
|
return all_results
|
487
|
-
|
488
|
-
logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
|
489
|
-
|
501
|
+
logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
|
490
502
|
if parallel and self.max_workers > 1:
|
491
|
-
#
|
503
|
+
# 使用线程池并行处理多个数据库
|
492
504
|
with concurrent.futures.ThreadPoolExecutor(
|
493
505
|
max_workers=self.max_workers
|
494
506
|
) as executor:
|
@@ -500,14 +512,13 @@ class MySQLDeduplicator:
|
|
500
512
|
self.deduplicate_database,
|
501
513
|
db, tables, db_columns_map, dry_run, False
|
502
514
|
)] = db
|
503
|
-
|
504
515
|
for future in concurrent.futures.as_completed(futures):
|
505
516
|
db = futures[future]
|
506
517
|
try:
|
507
518
|
db_results = future.result()
|
508
519
|
all_results[db] = db_results
|
509
520
|
except Exception as e:
|
510
|
-
logger.error(
|
521
|
+
logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
|
511
522
|
all_results[db] = {}
|
512
523
|
else:
|
513
524
|
# 串行处理数据库
|
@@ -518,8 +529,6 @@ class MySQLDeduplicator:
|
|
518
529
|
db, tables, db_columns_map, dry_run, parallel
|
519
530
|
)
|
520
531
|
all_results[db] = db_results
|
521
|
-
|
522
|
-
# 统计总体结果
|
523
532
|
total_dup = sum(
|
524
533
|
r[0] for db in all_results.values()
|
525
534
|
for r in db.values()
|
@@ -528,16 +537,10 @@ class MySQLDeduplicator:
|
|
528
537
|
r[1] for db in all_results.values()
|
529
538
|
for r in db.values()
|
530
539
|
)
|
531
|
-
|
532
|
-
logger.info(
|
533
|
-
f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
|
534
|
-
{'total_results': all_results}
|
535
|
-
)
|
536
|
-
|
540
|
+
logger.info('全局完成', {"总重复组数": total_dup, "总删除行数": total_del, "详细结果": dict(all_results)})
|
537
541
|
return all_results
|
538
|
-
|
539
542
|
except Exception as e:
|
540
|
-
logger.error(
|
543
|
+
logger.error('异常', {"error": str(e), 'traceback': repr(e)})
|
541
544
|
return all_results
|
542
545
|
|
543
546
|
@_retry_on_failure
|
@@ -571,8 +574,10 @@ class MySQLDeduplicator:
|
|
571
574
|
self.pool.close()
|
572
575
|
self._closed = True
|
573
576
|
logger.info("数据库连接池已关闭")
|
577
|
+
else:
|
578
|
+
logger.info('连接池已关闭或不存在')
|
574
579
|
except Exception as e:
|
575
|
-
logger.error(f"
|
580
|
+
logger.error(f"关闭连接池时出错", {'error_type': type(e).__name__, 'error': str(e)})
|
576
581
|
|
577
582
|
def __enter__(self) -> 'MySQLDeduplicator':
|
578
583
|
return self
|
@@ -590,13 +595,14 @@ def main():
|
|
590
595
|
)
|
591
596
|
|
592
597
|
# 全库去重(单线程)
|
593
|
-
deduplicator.deduplicate_all()
|
598
|
+
deduplicator.deduplicate_all(dry_run=False, parallel=False)
|
594
599
|
|
595
600
|
# # 指定数据库去重(多线程)
|
596
|
-
#
|
601
|
+
# logger.info('调用deduplicate_database')
|
602
|
+
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
|
597
603
|
|
598
604
|
# # 指定表去重(使用特定列)
|
599
|
-
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
|
605
|
+
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
|
600
606
|
|
601
607
|
# 关闭连接
|
602
608
|
deduplicator.close()
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=mvvcpn_eYjsZWNgQAvfJdawR8GlNJmr51SxpSdq4Ekc,19
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/optimize.py,sha256=zC_w_aVYXwmvfF0Z8iSGMmv5vptF0rP-Dz5zmp0gXTU,19820
|
5
5
|
mdbq/aggregation/query_data.py,sha256=fdotW8qdAyDB13p7r3p6AGBkavcHnf6hIvSMtcS7vqE,179875
|
@@ -9,7 +9,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
9
9
|
mdbq/log/mylogger.py,sha256=07sstIeaIQUJXwpMwmxppRI7kW7QwZFnv4Rr3UDlyUs,24133
|
10
10
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
11
11
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
12
|
-
mdbq/mysql/deduplicator.py,sha256=
|
12
|
+
mdbq/mysql/deduplicator.py,sha256=sm99eneNO7Br21BH-8vnZW3b7jA3gPF7c9Bvz04YV_g,27759
|
13
13
|
mdbq/mysql/mysql.py,sha256=Lfy9PsEdgmdRtcG_UUgegH3bFTJPhByTWkcAYl8G6m0,56788
|
14
14
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
15
15
|
mdbq/mysql/uploader.py,sha256=3RzslC10pNIYm-0NASicvCHXH0zgUXx7uD1jE21z_OU,64677
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
|
28
|
-
mdbq-3.10.
|
29
|
-
mdbq-3.10.
|
30
|
-
mdbq-3.10.
|
31
|
-
mdbq-3.10.
|
28
|
+
mdbq-3.10.10.dist-info/METADATA,sha256=zIHTb2D1u7ZjwGM-zGMhGJbOTybYgzB30yjPCRBdW5w,365
|
29
|
+
mdbq-3.10.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-3.10.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-3.10.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|