mdbq 3.11.7__py3-none-any.whl → 3.11.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/log/mylogger.py +1 -1
- mdbq/mysql/deduplicator.py +373 -226
- mdbq/mysql/uploader.py +49 -1
- {mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/METADATA +1 -1
- {mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/RECORD +8 -8
- {mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/WHEEL +0 -0
- {mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.11.7'
+VERSION = '3.11.9'
mdbq/log/mylogger.py
CHANGED
@@ -247,7 +247,7 @@ class MyLogger:
                 if isinstance(log_data.get('message'), str):
                     log_data['message'] = log_data['message'].replace(field, '***')
 
-            return json.dumps(log_data, ensure_ascii=False)
+            return json.dumps(log_data, ensure_ascii=False, default=str)
 
         formatter = StructuredFormatter()
 
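The one-line change above is easy to undersell: without default=str, json.dumps raises TypeError as soon as a log record carries a non-serializable value such as a datetime, killing the log call itself. A minimal standalone sketch of the difference (not taken from the package):

    import json
    from datetime import datetime

    record = {'message': 'job finished', 'ts': datetime(2025, 5, 27, 12, 0)}

    # Without default=str this raises:
    # TypeError: Object of type datetime is not JSON serializable
    print(json.dumps(record, ensure_ascii=False, default=str))
    # {"message": "job finished", "ts": "2025-05-27 12:00:00"}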
mdbq/mysql/deduplicator.py
CHANGED
@@ -13,6 +13,7 @@ import concurrent.futures
 from collections import defaultdict
 import sys
 from datetime import datetime
+import uuid
 
 
 warnings.filterwarnings('ignore')
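The new uuid import is used further down to salt backup-table names: a timestamp alone can collide when two reorders of the same table start within the same second. The suffix pattern the module builds looks like this (a sketch, not package code):

    import time
    import uuid

    suffix = f"_{int(time.time())}_{uuid.uuid4().hex[:8]}"
    print(suffix)  # e.g. _1748350000_3f9a1c2b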
@@ -81,7 +82,7 @@ class MySQLDeduplicator:
         exclude_columns: Optional[List[str]] = None,
         exclude_databases: Optional[List[str]] = None,
         exclude_tables: Optional[Dict[str, List[str]]] = None,
-        duplicate_keep_mode: str = 'keep_one'
+        duplicate_keep_mode: str = 'keep_one'
     ) -> None:
         """
         Initialize the deduplicator
@@ -113,7 +114,7 @@ class MySQLDeduplicator:
         )
 
         # Configuration parameters
-        self.max_workers = max(1, max_workers)
+        self.max_workers = min(max(1, max_workers), pool_size)  # cap the worker threads; never exceed the connection pool size
         self.batch_size = batch_size
         self.skip_system_dbs = skip_system_dbs
         self.max_retries = max_retries
@@ -191,7 +192,6 @@ class MySQLDeduplicator:
             raise ConnectionError("连接池已关闭")
         try:
             conn = self.pool.connection()
-            logger.debug("成功获取数据库连接")
             return conn
         except Exception as e:
             logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
@@ -269,7 +269,8 @@ class MySQLDeduplicator:
             with conn.cursor() as cursor:
                 cursor.execute(f"USE `{database}`")
                 cursor.execute(sql)
-                return [row[f'Tables_in_{database}'] for row in cursor.fetchall()]
+                # Strictly filter out every table name prefixed with 'temp_' (e.g. temp_xxx, temp_xxx_dedup_..., temp_xxx_reorderid_..., and so on)
+                return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]
 
     @_retry_on_failure
     def _get_table_columns(self, database: str, table: str) -> List[str]:
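With this filter, _get_tables hides any in-flight worker artifacts (dedup and reorder scratch tables) from callers, so batch operations never recurse into their own temp tables. The regex is equivalent to a simple prefix test; a quick illustration with hypothetical names:

    import re

    tables = ['主体报表_2025', 'temp_主体报表_dedup_123_456', 'temp_x_reorderid_1_2']
    kept = [t for t in tables if not re.match(r'^temp_.*', t)]  # same as: not t.startswith('temp_')
    print(kept)  # ['主体报表_2025']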
@@ -328,48 +329,73 @@ class MySQLDeduplicator:
             if key in self._processing_tables:
                 self._processing_tables.remove(key)
 
+    @_retry_on_failure
+    def _ensure_index(self, database: str, table: str, date_column: str) -> None:
+        """
+        Check date_column and automatically create an index for it when none exists.
+        Args:
+            database (str): Database name.
+            table (str): Table name.
+            date_column (str): Date column to check.
+        """
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # Check whether an index already exists
+                cursor.execute(
+                    """
+                    SELECT COUNT(1) as idx_count FROM INFORMATION_SCHEMA.STATISTICS
+                    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s
+                    """,
+                    (database, table, date_column)
+                )
+                idx_count = cursor.fetchone()['idx_count']
+                if idx_count == 0:
+                    # Create the index automatically
+                    index_name = f"idx_{date_column}"
+                    safe_index_name = self._make_safe_table_name(index_name, prefix='', suffix='', max_length=64)
+                    try:
+                        cursor.execute(f"CREATE INDEX `{safe_index_name}` ON `{database}`.`{table}` (`{date_column}`)")
+                        conn.commit()
+                        logger.info('已自动为date_column创建索引', {"库": database, "表": table, "date_column": date_column, "索引名": safe_index_name})
+                    except Exception as e:
+                        logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
+                else:
+                    logger.debug('date_column已存在索引', {"库": database, "表": table, "date_column": date_column})
+
     def _deduplicate_table(
         self,
         database: str,
         table: str,
         columns: Optional[List[str]] = None,
         dry_run: bool = False,
-        reset_id: bool = False
+        use_python_dedup: bool = False
     ) -> Tuple[int, int]:
         """
         Deduplicate a single table.
-
-        Args:
-            database (str): Database name.
-            table (str): Table name.
-            columns (Optional[List[str]]): Columns to deduplicate on (all columns when None).
-            dry_run (bool): Whether this is a dry run (count duplicates without deleting).
-            reset_id (bool): Whether to renumber the id column after deduplication.
-        Returns:
-            Tuple[int, int]: (duplicate groups, rows actually deleted).
+        Works day by day when the table contains date_column; otherwise deduplicates the whole table.
+        When date_column is listed in exclude_columns, the table is skipped outright.
+        Optimizations: batch deletes keyed on the primary key, no repeated temp-table create/drop, per-day concurrency.
         """
         if not self._acquire_table_lock(database, table):
             return (0, 0)
         temp_table = None
         try:
-            # Get the original total row count
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
-                    cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
-                    total_count_row = cursor.fetchone()
-                    total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
-                    logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
             # Get the actual column names
             all_columns = self._get_table_columns(database, table)
-            logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
-            # Check whether time-range filtering is needed
-            use_time_filter = False
-            time_col = self.date_column
             all_columns_lower = [col.lower() for col in all_columns]
-            # Apply exclude_columns
             exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
-
+            time_col = self.date_column
+            time_col_lower = time_col.lower() if time_col else None
+            # 1. Skip the table when date_column sits in exclude_columns
+            if time_col_lower and time_col_lower in exclude_columns_lower:
+                logger.warning('date_column在exclude_columns中,跳过该表', {"库": database, "表": table, "date_column": time_col, "exclude_columns": self.exclude_columns})
+                return (0, 0)
+            # 2. Determine whether the table contains date_column
+            has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
+            # When it does, check for and create the index automatically
+            if has_time_col:
+                self._ensure_index(database, table, time_col)
+            # 3. Collect the deduplication columns
             use_columns = columns or all_columns
             use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
             invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
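Both this _ensure_index and the one added to uploader.py later in this diff lean on the same trick: INFORMATION_SCHEMA.STATISTICS holds one row per (index, column) pair, so a COUNT filtered by schema, table, and column says whether any index touches the column at all (including as a non-leading member, which is a known looseness of the check). A standalone sketch using pymysql with placeholder credentials; the package itself may wrap its connections differently:

    import pymysql

    def column_is_indexed(conn, database: str, table: str, column: str) -> bool:
        # One row per (index, column) pair; COUNT > 0 means some index covers the column.
        with conn.cursor() as cursor:
            cursor.execute(
                "SELECT COUNT(1) FROM INFORMATION_SCHEMA.STATISTICS "
                "WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s",
                (database, table, column),
            )
            return cursor.fetchone()[0] > 0

    conn = pymysql.connect(host='localhost', user='root', password='pwd')  # placeholders
    if not column_is_indexed(conn, 'my_db', 'my_table', '日期'):
        with conn.cursor() as cursor:
            cursor.execute("CREATE INDEX `idx_日期` ON `my_db`.`my_table` (`日期`)")
        conn.commit()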
@@ -378,85 +404,126 @@ class MySQLDeduplicator:
             if not use_columns:
                 logger.error('没有有效的去重列', {"库": database, "表": table})
                 return (0, 0)
-            # Wrap every identifier in backticks
-            column_list = ', '.join([f'`{col}`' for col in use_columns])
-            temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
-            temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
             pk = self.primary_key
-            # Compare the primary key in lowercase as well
-            if pk.lower() not in all_columns_lower and pk != 'id':
-                logger.error('', {"不存在主键列": database, "表": table, "主键列不存在": pk})
-                return (0, 0)
-            # Find the actual primary-key name
             pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
-            # …
-            if …
+            # Decide whether a date-range condition is needed
+            where_sql = ''
+            if has_time_col and self._dedup_start_date and self._dedup_end_date:
+                where_sql = f"t.`{time_col}` >= '{self._dedup_start_date}' AND t.`{time_col}` <= '{self._dedup_end_date}'"
+            # Get the original row count (restricted to the date range)
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    count_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
+                    count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                    logger.debug('执行SQL', {'sql': count_sql})
+                    cursor.execute(count_sql)
+                    total_count_row = cursor.fetchone()
+                    total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+                    logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
+            column_list = ', '.join([f'`{col}`' for col in use_columns])
+
+            # Find duplicates in Python
+            if use_python_dedup:
+                from collections import defaultdict
+                # 1. Fetch all candidate rows
+                select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
+                select_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
+                select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
+                logger.debug('用Python查找重复,拉取数据SQL', {'sql': select_sql})
+                with self._get_connection() as conn:
+                    with conn.cursor() as cursor:
+                        cursor.execute(select_sql)
+                        rows = cursor.fetchall()
+                # 2. Group the rows to find duplicates
+                grouped = defaultdict(list)
+                for row in rows:
+                    key = tuple(row[col] for col in use_columns)
+                    grouped[key].append(row[pk_real])
+                # 3. Count duplicate groups and collect the ids to delete
+                dup_count = 0
+                del_ids = []
+                for ids in grouped.values():
+                    if len(ids) > 1:
+                        dup_count += 1
+                        del_ids.extend(ids[1:])  # keep only the first row
+                affected_rows = 0
+                if not dry_run and del_ids:
+                    with self._get_connection() as conn:
+                        with conn.cursor() as cursor:
+                            for i in range(0, len(del_ids), self.batch_size):
+                                batch = del_ids[i:i+self.batch_size]
+                                del_ids_str = ','.join([str(i) for i in batch])
+                                delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                logger.debug('用Python分批删除SQL', {'sql': delete_sql, 'ids': batch})
+                                cursor.execute(delete_sql)
+                                batch_deleted = cursor.rowcount
+                                affected_rows += batch_deleted
+                                conn.commit()
+                logger.info('用Python去重完成', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "去重模式": self.duplicate_keep_mode, "实际去重列": use_columns})
+                return (dup_count, affected_rows)
+            # Find duplicates via SQL
+            temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
+            drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+            # Apply the where condition when creating the temp table
+            create_temp_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
             create_temp_sql = f"""
             CREATE TABLE `{database}`.`{temp_table}` AS
             SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
             FROM `{database}`.`{table}`
-            {…}
+            {create_temp_where}
             GROUP BY {column_list}
             HAVING COUNT(*) > 1
             """
-            drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
                     logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                     cursor.execute(create_temp_sql)
-                    logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
                     cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
                     dup_count_row = cursor.fetchone()
                     dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                     if dup_count == 0:
-                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "…
-                        logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
+                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "实际去重列": use_columns})
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
                     affected_rows = 0
                     if not dry_run:
-                        # Delete in batches to avoid locking the table
                         while True:
-                            …
-                            cursor.execute(delete_dup_sql)
+                            where_clauses = []
+                            if self.duplicate_keep_mode == 'keep_one':
+                                where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
+                            if where_sql.strip():
+                                where_clauses.append(where_sql.strip())
+                            where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+                            find_dup_ids_sql = f"""
+                            SELECT t.`{pk_real}` as del_id
+                            FROM `{database}`.`{table}` t
+                            JOIN `{database}`.`{temp_table}` tmp
+                            ON {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+                            {where_full}
+                            LIMIT {self.batch_size}
+                            """
+                            logger.debug('查找待删除重复id SQL', {'sql': find_dup_ids_sql})
+                            cursor.execute(find_dup_ids_sql)
+                            del_ids = [row['del_id'] for row in cursor.fetchall()]
+                            if not del_ids:
+                                break
+                            del_ids_str = ','.join([str(i) for i in del_ids])
+                            delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                            logger.debug('按id批量删除SQL', {'sql': delete_sql, 'ids': del_ids})
+                            cursor.execute(delete_sql)
                             batch_deleted = cursor.rowcount
                             affected_rows += batch_deleted
                             conn.commit()
+                            if batch_deleted == 0:
+                                logger.warning('检测到未能删除任何数据,强制跳出循环,防止假死', {"库": database, "表": table})
+                                break
                             if batch_deleted < self.batch_size:
                                 break
-                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "…
-                        # New: renumber id after deduplication
-                        if reset_id and affected_rows > 0:
-                            self._reset_id_column(database, table)
+                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "去重模式": self.duplicate_keep_mode, "实际去重列": use_columns})
                     else:
-                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count…
+                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count})
                         affected_rows = 0
-                    logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
                     cursor.execute(drop_temp_sql)
                     conn.commit()
                     return (dup_count, affected_rows)
@@ -482,7 +549,8 @@ class MySQLDeduplicator:
         table: str,
         columns: Optional[List[str]] = None,
         dry_run: bool = False,
-        reset_id: bool = False
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Tuple[int, int]:
         """
         Deduplicate the specified table.
@@ -492,7 +560,8 @@ class MySQLDeduplicator:
             table (str): Table name.
             columns (Optional[List[str]]): Columns to deduplicate on (all columns when None).
             dry_run (bool): Whether this is a dry run (count duplicates without deleting).
-            reset_id (bool): Whether to renumber the id column after deduplication.
+            reorder_id (bool): Whether to renumber the id column after deduplication.
+            use_python_dedup (bool): Whether to find duplicate ids in Python.
         Returns:
             Tuple[int, int]: (duplicate groups, rows actually deleted).
         """
@@ -503,9 +572,17 @@ class MySQLDeduplicator:
             if not self._check_table_exists(database, table):
                 logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
                 return (0, 0)
-            logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
-            result = self._deduplicate_table(database, table, columns, dry_run, reset_id)
+            logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
+            result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup)
             logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
+            # Automatically renumber the id column (only when rows were actually deleted and reorder_id is True)
+            dup_count, affected_rows = result
+            if reorder_id and affected_rows > 0:
+                try:
+                    reorder_ok = self.reorder_id_column(database, table, id_column=self.primary_key, dry_run=dry_run)
+                    logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
+                except Exception as e:
+                    logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
             return result
         except Exception as e:
             logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -518,7 +595,8 @@ class MySQLDeduplicator:
         columns_map: Optional[Dict[str, List[str]]] = None,
         dry_run: bool = False,
         parallel: bool = False,
-        reset_id: bool = False
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Tuple[int, int]]:
         """
         Deduplicate every table in the specified database.
@@ -529,7 +607,8 @@ class MySQLDeduplicator:
             columns_map (Optional[Dict[str, List[str]]]): Deduplication columns per table, {table: [columns]}.
             dry_run (bool): Whether this is a dry run.
             parallel (bool): Whether to process in parallel.
-            reset_id (bool): Whether to renumber the id column after deduplication.
+            reorder_id (bool): Whether to renumber the id column after deduplication.
+            use_python_dedup (bool): Whether to find duplicate ids in Python.
         Returns:
             Dict[str, Tuple[int, int]]: {table: (duplicate groups, rows actually deleted)}.
         """
@@ -558,7 +637,7 @@ class MySQLDeduplicator:
                     logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
                     futures[executor.submit(
                         self.deduplicate_table,
-                        database, table, columns, dry_run, reset_id
+                        database, table, columns, dry_run, reorder_id, True
                     )] = table
                 for future in concurrent.futures.as_completed(futures):
                     table = futures[future]
@@ -574,7 +653,7 @@ class MySQLDeduplicator:
                 for table in target_tables:
                     columns = columns_map.get(table) if columns_map else None
                     dup_count, affected_rows = self.deduplicate_table(
-                        database, table, columns, dry_run, reset_id
+                        database, table, columns, dry_run, reorder_id, True
                     )
                     results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
@@ -592,7 +671,8 @@ class MySQLDeduplicator:
         columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
         dry_run: bool = False,
         parallel: bool = False,
-        reset_id: bool = False
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Dict[str, Tuple[int, int]]]:
         """
         Deduplicate every database.
@@ -603,7 +683,8 @@ class MySQLDeduplicator:
             columns_map (Optional[Dict[str, Dict[str, List[str]]]]): Columns to deduplicate per table, as {database: {table: [column, ...]}}. All columns are used when None.
             dry_run (bool): Whether this is a dry run: count duplicate rows without deleting them.
             parallel (bool): Whether to process several databases in parallel via the thread pool.
-            reset_id (bool): Whether to renumber the id column after deduplication.
+            reorder_id (bool): Whether to renumber the id column after deduplication.
+            use_python_dedup (bool): Whether to find duplicate ids in Python.
         Returns:
             Dict[str, Dict[str, Tuple[int, int]]]: Nested dict of the form {database: {table: (duplicate groups, rows actually deleted)}}.
         """
@@ -615,7 +696,7 @@ class MySQLDeduplicator:
         if not target_dbs:
             logger.warning('没有可处理的数据库')
             return all_results
-        logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
+        logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
         if parallel and self.max_workers > 1:
             # Process multiple databases in parallel with a thread pool
             with concurrent.futures.ThreadPoolExecutor(
@@ -627,7 +708,7 @@ class MySQLDeduplicator:
                     db_columns_map = columns_map.get(db) if columns_map else None
                     futures[executor.submit(
                         self.deduplicate_database,
-                        db, tables, db_columns_map, dry_run, False, reset_id
+                        db, tables, db_columns_map, dry_run, False, reorder_id, True
                     )] = db
                 for future in concurrent.futures.as_completed(futures):
                     db = futures[future]
@@ -643,7 +724,7 @@ class MySQLDeduplicator:
                 tables = tables_map.get(db) if tables_map else None
                 db_columns_map = columns_map.get(db) if columns_map else None
                 db_results = self.deduplicate_database(
-                    db, tables, db_columns_map, dry_run, parallel, reset_id
+                    db, tables, db_columns_map, dry_run, parallel, reorder_id, True
                 )
                 all_results[db] = db_results
             total_dup = sum(
@@ -738,145 +819,206 @@ class MySQLDeduplicator:
         """
         self.close()
 
-    def …
-        … (old temp-table DDL helper, truncated in the extracted diff)
-                f'CREATE TABLE `{temp_table}`',
-                create_sql,
-                count=1
-            )
-            create_sql_temp = replace_id_type(create_sql_temp)
-            create_sql_temp = re.sub(r'AUTO_INCREMENT=\d+', '', create_sql_temp)
-            return create_sql_temp
-
-    def _create_and_fill_temp_table(self, database: str, table: str, temp_table: str, pk: str) -> list:
-        """Create the temp table and insert the rows with renumbered ids; return all column names."""
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(f"USE `{database}`")
-                cursor.execute(f"SHOW COLUMNS FROM `{database}`.`{table}`")
-                columns = [row['Field'] for row in cursor.fetchall()]
-                columns_wo_id = [col for col in columns if col != pk]
-                col_list = ', '.join([f'`{col}`' for col in columns_wo_id])
-                insert_sql = f"INSERT INTO `{database}`.`{temp_table}` ({col_list}, `{pk}`) SELECT {col_list}, (@rownum:=@rownum+1) as `{pk}` FROM `{database}`.`{table}` JOIN (SELECT @rownum:=0) r ORDER BY `{pk}` ASC"
-                cursor.execute(insert_sql)
-                return columns
-
-    def _swap_tables_with_backup(self, database: str, table: str, temp_table: str, bak_table: str):
-        """Rename the original table to the backup name and the temp table to the original name."""
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(f"USE `{database}`")
-                cursor.execute(f"RENAME TABLE `{database}`.`{table}` TO `{database}`.`{bak_table}`")
-                cursor.execute(f"RENAME TABLE `{database}`.`{temp_table}` TO `{database}`.`{table}`")
-                conn.commit()
-
-    def _check_and_cleanup_backup(self, database: str, table: str, bak_table: str) -> bool:
-        """Verify that the new table and the backup hold the same row count, then drop the backup safely."""
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(f"USE `{database}`")
-                cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
-                new_cnt = cursor.fetchone()['cnt']
-                cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{bak_table}`")
-                old_cnt = cursor.fetchone()['cnt']
-                if new_cnt == old_cnt:
-                    cursor.execute(f"DROP TABLE `{database}`.`{bak_table}`")
-                    conn.commit()
-                    return True
-                else:
-                    logger.error('id重排后数据量不一致,未删除备份表', {'库': database, '表': table, '新表行数': new_cnt, '备份表行数': old_cnt})
-                    return False
-
-    def _rollback_table_swap(self, database: str, table: str, bak_table: str):
-        """Roll back: restore the original table name when the bak table exists and the original does not."""
+    def reorder_id_column(
+        self,
+        database: str,
+        table: Optional[str] = None,
+        id_column: str = "id",
+        dry_run: bool = False,
+        auto_drop_backup: bool = True
+    ) -> Any:
+        """
+        Safely renumber the id column of one table, or of every table in a database, to sequential ascending order (1, 2, 3...).
+        Args:
+            database (str): Database name
+            table (Optional[str]): Table name; None processes every table in the database
+            id_column (str): Name of the id column, "id" by default
+            dry_run (bool): Whether this is a dry run
+            auto_drop_backup (bool): Drop the backup table automatically once verification passes
+        Returns:
+            bool or dict: bool for a single table, {table: bool} in batch mode
+        """
+        if not table:
+            # Batch mode: run against every table in the database
+            try:
+                all_tables = self._get_tables(database)
+            except Exception as e:
+                logger.error('获取库下所有表失败', {"库": database, "异常": str(e)})
+                return {}
+            results = {}
+            for tbl in all_tables:
+                try:
+                    res = self.reorder_id_column(database, tbl, id_column, dry_run, auto_drop_backup)
+                    results[tbl] = res
+                except Exception as e:
+                    logger.error('批量id重排异常', {"库": database, "表": tbl, "异常": str(e)})
+                    results[tbl] = False
+            logger.info('批量id重排完成', {"库": database, "结果": results})
+            return results
+        # Single-table mode
+        table_quoted = f"`{database}`.`{table}`"
+        if not self._acquire_table_lock(database, table):
+            logger.warning('表级锁获取失败,跳过id重排', {"库": database, "表": table})
+            return False
         try:
+            # Check that the table exists
+            if not self._check_table_exists(database, table):
+                logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
+                return False
+            # Check that the id column exists
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
-                    cursor.execute(…
-                    … (old rollback body, truncated in the extracted diff)
-
-    def _reset_id_column(self, database: str, table: str) -> bool:
-        pk = self.primary_key
-        temp_table = f"temp_{table}_resetid_{os.getpid()}_{threading.get_ident()}"
-        temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
-        bak_table = f"{table}_bak_{int(time.time())}"
-        try:
-            # 1. Check foreign-key dependencies
-            if self._has_foreign_key_dependency(database, table, pk):
-                logger.warning('存在外键依赖,拒绝重排id', {'库': database, '表': table})
+                    cursor.execute("""
+                        SELECT COLUMN_NAME, COLUMN_KEY
+                        FROM INFORMATION_SCHEMA.COLUMNS
+                        WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+                    """, (database, table))
+                    columns_info = cursor.fetchall()
+                    columns = [row['COLUMN_NAME'] for row in columns_info]
+                    id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+            if id_column not in columns:
+                logger.warning('表无id列,跳过id重排', {"库": database, "表": table})
                 return False
-            # …
+            # Check that the primary key is the single id column
+            pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+            if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
+                logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
+                return False
+            # Check for foreign-key constraints
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
-                    cursor.execute(…
-                    … (old body, truncated in the extracted diff)
+                    cursor.execute("""
+                        SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
+                        WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s AND REFERENCED_TABLE_NAME IS NOT NULL
+                    """, (database, table))
+                    if cursor.fetchone():
+                        logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
+                        return False
+            # Fetch the table definition
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
+                    create_table_sql = cursor.fetchone()['Create Table']
+            logger.info('开始id重排', {"库": database, "表": table, "重排列": id_column, "试运行": dry_run, "DDL警告": "MySQL DDL操作不可回滚,建议提前备份!"})
+            if dry_run:
+                logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
                 return True
-            # … (old body, truncated in the extracted diff)
+            temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_reorderid_{os.getpid()}_{threading.get_ident()}")
+            temp_table_quoted = f"`{database}`.`{temp_table}`"
+            backup_table = self._make_safe_table_name(table, prefix="backup_", suffix=f"_{int(time.time())}_{uuid.uuid4().hex[:8]}")
+            backup_table_quoted = f"`{database}`.`{backup_table}`"
+            try:
+                with self._get_connection() as conn:
+                    with conn.cursor() as cursor:
+                        # 1. Create a temp table with the same structure as the original
+                        try:
+                            cursor.execute(f"CREATE TABLE {temp_table_quoted} LIKE {table_quoted}")
+                        except Exception as e:
+                            logger.error('创建临时表失败', {"库": database, "表": table, "异常": str(e)})
+                            return False
+                        # 2. Insert the rows with the id column renumbered via ROW_NUMBER (MySQL 8+)
+                        all_cols = ','.join([f'`{col}`' for col in columns])
+                        all_cols_noid = ','.join([f'`{col}`' for col in columns if col != id_column])
+                        insert_sql = f"""
+                            INSERT INTO {temp_table_quoted} ({all_cols})
+                            SELECT ROW_NUMBER() OVER (ORDER BY `{id_column}` ASC) as `{id_column}`, {all_cols_noid}
+                            FROM {table_quoted}
+                        """
+                        try:
+                            cursor.execute(insert_sql)
+                        except Exception as e:
+                            logger.error('插入重排数据失败', {"库": database, "表": table, "异常": str(e)})
+                            try:
+                                cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
+                            except Exception as drop_e:
+                                logger.error('插入失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
+                            return False
+                        # When id is not already the primary key, try to add one (if it does not conflict)
+                        if not id_is_pk:
+                            try:
+                                cursor.execute(f"ALTER TABLE {temp_table_quoted} ADD PRIMARY KEY(`{id_column}`)")
+                            except Exception as e:
+                                logger.warning('id列加主键失败,可能已存在其他主键', {"库": database, "表": table, "异常": str(e)})
+                        # 3. Rename the original table to the backup name and the temp table to the official name
+                        try:
+                            cursor.execute(f"RENAME TABLE {table_quoted} TO {backup_table_quoted}, {temp_table_quoted} TO {table_quoted}")
+                        except Exception as e:
+                            logger.error('RENAME TABLE失败', {"库": database, "表": table, "异常": str(e)})
+                            # Roll back: drop the temp table
+                            try:
+                                cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
+                            except Exception as drop_e:
+                                logger.error('RENAME失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
+                            return False
+                        # 4. Verify the new table and the backup hold the same row count
+                        try:
+                            cursor.execute(f"SELECT COUNT(*) as cnt FROM {table_quoted}")
+                            new_cnt = cursor.fetchone()['cnt']
+                            cursor.execute(f"SELECT COUNT(*) as cnt FROM {backup_table_quoted}")
+                            old_cnt = cursor.fetchone()['cnt']
+                        except Exception as e:
+                            logger.error('校验数据量失败', {"库": database, "表": table, "异常": str(e)})
+                            return False
+                        if new_cnt != old_cnt:
+                            logger.error('id重排后数据量不一致,自动回滚', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt})
+                            # Roll back: restore the original table
+                            try:
+                                cursor.execute(f"DROP TABLE {table_quoted}")
+                                cursor.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
+                            except Exception as e:
+                                logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
+                            return False
+                        logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
+                        # 5. Optional: drop the backup table automatically
+                        if auto_drop_backup:
+                            try:
+                                cursor.execute(f"DROP TABLE {backup_table_quoted}")
+                                logger.info('已自动删除备份表', {"库": database, "表": table, "备份表名": backup_table})
+                            except Exception as e:
+                                logger.error('自动删除备份表失败', {"库": database, "表": table, "异常": str(e)})
+                        return True
+            except Exception as e:
+                logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
+                # Roll back: drop the temp table if it still exists, then restore the original table
+                with self._get_connection() as conn:
+                    with conn.cursor() as cursor:
+                        try:
+                            cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
+                        except Exception as drop_e:
+                            logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
+                # Restore the original table (when the backup exists)
+                try:
+                    with self._get_connection() as conn2:
+                        with conn2.cursor() as cursor2:
+                            if self._check_table_exists(database, backup_table):
+                                cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
+                                cursor2.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
+                                logger.info('已自动恢复原表', {"库": database, "表": table, "备份表名": backup_table})
+                except Exception as recover_e:
+                    logger.error('回滚时恢复原表失败', {"库": database, "表": table, "异常": str(recover_e)})
                 return False
-            # … (old body, truncated in the extracted diff)
+        finally:
+            self._release_table_lock(database, table)
+
+    @staticmethod
+    def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
+        """
+        Build a safe MySQL table name whose total length never exceeds max_length bytes.
+        :param base: original table name
+        :param prefix: prefix
+        :param suffix: suffix
+        :param max_length: maximum length, 64 by default
+        :return: safe table name
+        """
+        # Allow only letters, digits, and underscores
+        base = re.sub(r'[^a-zA-Z0-9_]', '_', base)
+        prefix = re.sub(r'[^a-zA-Z0-9_]', '_', prefix)
+        suffix = re.sub(r'[^a-zA-Z0-9_]', '_', suffix)
+        remain = max_length - len(prefix) - len(suffix)
+        if remain < 1:
+            # Prefix plus suffix are already too long: truncate them outright
+            return (prefix + suffix)[:max_length]
+        return f"{prefix}{base[:remain]}{suffix}"[:max_length]
 
 
 def main():
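A note on _make_safe_table_name: MySQL identifiers are capped at 64 characters, and the generated scratch names stack a prefix plus pid, thread id, timestamp, and uuid fragment onto the user's table name, so long or non-ASCII base names must be sanitized and truncated. The same logic as a standalone function (a sketch mirroring the method above):

    import re

    def make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
        # Keep only [A-Za-z0-9_], then truncate the base so prefix + base + suffix fits.
        base = re.sub(r'[^a-zA-Z0-9_]', '_', base)
        prefix = re.sub(r'[^a-zA-Z0-9_]', '_', prefix)
        suffix = re.sub(r'[^a-zA-Z0-9_]', '_', suffix)
        remain = max_length - len(prefix) - len(suffix)
        if remain < 1:
            return (prefix + suffix)[:max_length]
        return f"{prefix}{base[:remain]}{suffix}"[:max_length]

    name = make_safe_table_name('主体报表_2025_copy1' * 3, prefix='temp_', suffix='_reorderid_123_456')
    print(name, len(name))  # the result never exceeds 64 characters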
@@ -884,21 +1026,26 @@ def main():
         username='root',
         password='pwd',
         host='localhost',
-        port=3306
+        port=3306,
+        date_range=['2025-05-27', '2025-05-28'],
+        exclude_tables={'推广数据2': ['地域报表_城市_2025_05_copy1', '主体报表_2025_copy1']}
     )
 
     # Deduplicate every database (single-threaded)
-    deduplicator.deduplicate_all(dry_run=False, parallel=False)
+    deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
 
     # # Deduplicate one database (multi-threaded)
-    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False, reorder_id=False)
 
     # # Deduplicate one table (using specific columns)
-    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False, reset_id=False)
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False, reorder_id=False)
+
+    # # Renumber the id column
+    # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
 
     # Close the connection
     deduplicator.close()
 
 if __name__ == '__main__':
-    …
+    main()
     pass
mdbq/mysql/uploader.py
CHANGED
@@ -428,6 +428,7 @@ class MySQLUploader:
             if idx_col in set_typ:
                 safe_idx_col = self._validate_identifier(idx_col)
                 index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
+        index_defs = list(set(index_defs))
         index_sql = (',' + ','.join(index_defs)) if index_defs else ''
         sql = f"""
         CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
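list(set(index_defs)) protects the generated CREATE TABLE from emitting the same INDEX clause twice, presumably when a column is requested more than once (for instance via both the indexes list and the partition date column). A set does not preserve insertion order, which is harmless here since each clause is independent. A tiny illustration:

    index_defs = [
        "INDEX `idx_日期` (`日期`)",
        "INDEX `idx_店铺名称` (`店铺名称`)",
        "INDEX `idx_日期` (`日期`)",  # requested twice
    ]
    index_defs = list(set(index_defs))  # duplicates removed, order arbitrary
    print(len(index_defs))  # 2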
@@ -593,6 +594,34 @@ class MySQLUploader:
             logger.error('无法获取表列信息', {'库': db_name, '表': table_name, '错误': str(e)})
             raise
 
+    def _ensure_index(self, db_name: str, table_name: str, column: str):
+        """
+        Make sure the column is indexed; create the index when it is missing.
+        """
+        db_name = self._validate_identifier(db_name)
+        table_name = self._validate_identifier(table_name)
+        column = self._validate_identifier(column)
+        # Check whether an index already exists
+        sql_check = '''
+            SELECT COUNT(1) FROM INFORMATION_SCHEMA.STATISTICS
+            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s
+        '''
+        sql_create = f'ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{column}` (`{column}`)'
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql_check, (db_name, table_name, column))
+                    exists = cursor.fetchone()
+                    if exists and list(exists.values())[0] > 0:
+                        logger.debug('索引已存在', {'库': db_name, '表': table_name, '列': column})
+                        return
+                    cursor.execute(sql_create)
+                    conn.commit()
+                    logger.info('已为列创建索引', {'库': db_name, '表': table_name, '列': column})
+        except Exception as e:
+            logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
+            raise
+
     def _upload_to_table(
         self,
         db_name: str,
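One detail worth noticing: this copy reads the count via list(exists.values())[0] rather than a column alias, which works with a dict-style cursor no matter how the COUNT column is named. The dict-style row access throughout the diff suggests a pymysql DictCursor (an assumption; the package may configure its pool differently). A sketch of the idiom:

    import pymysql
    from pymysql.cursors import DictCursor

    conn = pymysql.connect(host='localhost', user='root', password='pwd',
                           cursorclass=DictCursor)  # placeholder credentials
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(1) FROM INFORMATION_SCHEMA.STATISTICS")
        row = cursor.fetchone()        # e.g. {'COUNT(1)': 123}
        count = list(row.values())[0]  # safe even without an AS alias
    print(count)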
@@ -646,6 +675,13 @@ class MySQLUploader:
             })
             raise ValueError(f"列不存在: `{col}` -> `{db_name}`.`{table_name}`")
 
+        # Make sure the partition reference column is indexed
+        if date_column and date_column in table_columns:
+            try:
+                self._ensure_index(db_name, table_name, date_column)
+            except Exception as e:
+                logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
+
         # Insert the data
         self._insert_data(
             db_name, table_name, data, set_typ,
@@ -868,7 +904,7 @@ class MySQLUploader:
         :param duplicate_columns: columns used to check for duplicates, optional
         :param allow_null: whether NULL values are allowed, False by default
         :param partition_by: partition scheme ('year', 'month', or 'None'), optional
-        :param partition_date_column: date column used for partitioning, '日期' by default
+        :param partition_date_column: date column used for partitioning, '日期' by default; it is also indexed automatically
         :param auto_create: whether to create the table automatically when it does not exist, True by default
         :param indexes: list of columns to index, optional
         :param update_on_duplicate: whether duplicates overwrite existing rows, False by default
@@ -977,6 +1013,12 @@ class MySQLUploader:
                         allow_null, auto_create, partition_date_column,
                         indexes, batch_id, update_on_duplicate, transaction_mode
                     )
+                    # Make sure the partition reference column is indexed
+                    if partition_date_column in filtered_set_typ:
+                        try:
+                            self._ensure_index(db_name, part_table, partition_date_column)
+                        except Exception as e:
+                            logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': part_table, '列': partition_date_column, '错误': str(e)})
                 except Exception as e:
                     logger.error('分表上传异常', {
                         '库': db_name,
@@ -995,6 +1037,12 @@ class MySQLUploader:
                 allow_null, auto_create, partition_date_column,
                 indexes, batch_id, update_on_duplicate, transaction_mode
             )
+            # Make sure the partition reference column is indexed
+            if partition_date_column in filtered_set_typ:
+                try:
+                    self._ensure_index(db_name, table_name, partition_date_column)
+                except Exception as e:
+                    logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': partition_date_column, '错误': str(e)})
 
         success_flag = True
 
{mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=…
+mdbq/__version__.py,sha256=PDdrWyCY8MR3t82c_RzSF6lAB6oCcZdWveXkX7AvIIQ,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
-mdbq/log/mylogger.py,sha256=…
+mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=…
+mdbq/mysql/deduplicator.py,sha256=G7hdIO6rDLBNo1jSm6PbmPAzzfdN2jZFP4BnLhO02Mo,52970
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
-mdbq/mysql/uploader.py,sha256=…
+mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.7.dist-info/METADATA,sha256=…
-mdbq-3.11.7.dist-info/WHEEL,sha256=…
-mdbq-3.11.7.dist-info/top_level.txt,sha256=…
-mdbq-3.11.7.dist-info/RECORD,,
+mdbq-3.11.9.dist-info/METADATA,sha256=djSbJHNSHuyh2So6ia5CluTggpZ4REj9jxhO9vwOeKw,364
+mdbq-3.11.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.11.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.11.9.dist-info/RECORD,,
{mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/WHEEL
File without changes
{mdbq-3.11.7.dist-info → mdbq-3.11.9.dist-info}/top_level.txt
File without changes