mdbq 3.11.6__py3-none-any.whl → 3.11.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +237 -168
- {mdbq-3.11.6.dist-info → mdbq-3.11.8.dist-info}/METADATA +1 -1
- {mdbq-3.11.6.dist-info → mdbq-3.11.8.dist-info}/RECORD +6 -6
- {mdbq-3.11.6.dist-info → mdbq-3.11.8.dist-info}/WHEEL +0 -0
- {mdbq-3.11.6.dist-info → mdbq-3.11.8.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.6'
|
1
|
+
VERSION = '3.11.8'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -13,6 +13,7 @@ import concurrent.futures
|
|
13
13
|
from collections import defaultdict
|
14
14
|
import sys
|
15
15
|
from datetime import datetime
|
16
|
+
import uuid
|
16
17
|
|
17
18
|
|
18
19
|
warnings.filterwarnings('ignore')
|
@@ -80,7 +81,8 @@ class MySQLDeduplicator:
|
|
80
81
|
date_column: str = '日期',
|
81
82
|
exclude_columns: Optional[List[str]] = None,
|
82
83
|
exclude_databases: Optional[List[str]] = None,
|
83
|
-
exclude_tables: Optional[Dict[str, List[str]]] = None
|
84
|
+
exclude_tables: Optional[Dict[str, List[str]]] = None,
|
85
|
+
duplicate_keep_mode: str = 'keep_one'
|
84
86
|
) -> None:
|
85
87
|
"""
|
86
88
|
初始化去重处理器
|
@@ -90,6 +92,7 @@ class MySQLDeduplicator:
|
|
90
92
|
:param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
|
91
93
|
:param exclude_databases: 排除的数据库名列表
|
92
94
|
:param exclude_tables: 排除的表名字典 {数据库名: [表名, ...]}
|
95
|
+
:param duplicate_keep_mode: 'keep_one'(默认,重复组保留一条),'remove_all'(全部删除重复组)
|
93
96
|
"""
|
94
97
|
# 连接池状态标志
|
95
98
|
self._closed = False
|
@@ -173,6 +176,8 @@ class MySQLDeduplicator:
|
|
173
176
|
self.exclude_databases = set([db.lower() for db in exclude_databases]) if exclude_databases else set()
|
174
177
|
self.exclude_tables = {k.lower(): set([t.lower() for t in v]) for k, v in (exclude_tables or {}).items()}
|
175
178
|
|
179
|
+
self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
|
180
|
+
|
176
181
|
def _get_connection(self) -> pymysql.connections.Connection:
|
177
182
|
"""
|
178
183
|
从连接池获取一个数据库连接。
|
@@ -187,7 +192,6 @@ class MySQLDeduplicator:
|
|
187
192
|
raise ConnectionError("连接池已关闭")
|
188
193
|
try:
|
189
194
|
conn = self.pool.connection()
|
190
|
-
logger.debug("成功获取数据库连接")
|
191
195
|
return conn
|
192
196
|
except Exception as e:
|
193
197
|
logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
|
@@ -329,8 +333,7 @@ class MySQLDeduplicator:
|
|
329
333
|
database: str,
|
330
334
|
table: str,
|
331
335
|
columns: Optional[List[str]] = None,
|
332
|
-
dry_run: bool = False,
|
333
|
-
reset_id: bool = False
|
336
|
+
dry_run: bool = False
|
334
337
|
) -> Tuple[int, int]:
|
335
338
|
"""
|
336
339
|
执行单表去重。
|
@@ -340,7 +343,6 @@ class MySQLDeduplicator:
|
|
340
343
|
table (str): 表名。
|
341
344
|
columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
|
342
345
|
dry_run (bool): 是否为模拟运行(只统计不实际删除)。
|
343
|
-
reset_id (bool): 是否在去重后重排id。
|
344
346
|
Returns:
|
345
347
|
Tuple[int, int]: (重复组数, 实际删除行数)。
|
346
348
|
"""
|
@@ -376,8 +378,7 @@ class MySQLDeduplicator:
|
|
376
378
|
return (0, 0)
|
377
379
|
# 统一用反引号包裹
|
378
380
|
column_list = ', '.join([f'`{col}`' for col in use_columns])
|
379
|
-
temp_table = f"temp_
|
380
|
-
temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
|
381
|
+
temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
|
381
382
|
pk = self.primary_key
|
382
383
|
# 主键判断也用小写
|
383
384
|
if pk.lower() not in all_columns_lower and pk != 'id':
|
@@ -416,14 +417,28 @@ class MySQLDeduplicator:
|
|
416
417
|
if not dry_run:
|
417
418
|
# 分批删除,避免锁表
|
418
419
|
while True:
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
420
|
+
if self.duplicate_keep_mode == 'remove_all':
|
421
|
+
# 删除所有重复组的所有记录
|
422
|
+
delete_dup_sql = f"""
|
423
|
+
DELETE FROM `{database}`.`{table}`
|
424
|
+
WHERE ({', '.join([f'`{col}`' for col in use_columns])}) IN (
|
425
|
+
SELECT {column_list} FROM `{database}`.`{temp_table}`
|
426
|
+
) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
|
427
|
+
LIMIT {self.batch_size}
|
428
|
+
"""
|
429
|
+
else:
|
430
|
+
# 修正:只删除重复组中不是min_id的行,唯一数据不动
|
431
|
+
delete_dup_sql = f"""
|
432
|
+
DELETE FROM `{database}`.`{table}` t
|
433
|
+
WHERE EXISTS (
|
434
|
+
SELECT 1 FROM `{database}`.`{temp_table}` tmp
|
435
|
+
WHERE
|
436
|
+
{' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
|
437
|
+
AND t.`{pk_real}` <> tmp.`min_id`
|
438
|
+
)
|
439
|
+
{'AND' if use_time_filter else ''} {f't.`{time_col}` >= \'{self._dedup_start_date}\' AND t.`{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
|
440
|
+
LIMIT {self.batch_size}
|
441
|
+
"""
|
427
442
|
logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
|
428
443
|
cursor.execute(delete_dup_sql)
|
429
444
|
batch_deleted = cursor.rowcount
|
@@ -431,10 +446,7 @@ class MySQLDeduplicator:
|
|
431
446
|
conn.commit()
|
432
447
|
if batch_deleted < self.batch_size:
|
433
448
|
break
|
434
|
-
logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
|
435
|
-
# 新增:去重后重排id
|
436
|
-
if reset_id and affected_rows > 0:
|
437
|
-
self._reset_id_column(database, table)
|
449
|
+
logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns, "去重模式": self.duplicate_keep_mode})
|
438
450
|
else:
|
439
451
|
logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
|
440
452
|
affected_rows = 0
|
@@ -463,8 +475,7 @@ class MySQLDeduplicator:
|
|
463
475
|
database: str,
|
464
476
|
table: str,
|
465
477
|
columns: Optional[List[str]] = None,
|
466
|
-
dry_run: bool = False,
|
467
|
-
reset_id: bool = False
|
478
|
+
dry_run: bool = False
|
468
479
|
) -> Tuple[int, int]:
|
469
480
|
"""
|
470
481
|
对指定表进行去重。
|
@@ -474,7 +485,6 @@ class MySQLDeduplicator:
|
|
474
485
|
table (str): 表名。
|
475
486
|
columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
|
476
487
|
dry_run (bool): 是否为模拟运行(只统计不实际删除)。
|
477
|
-
reset_id (bool): 是否在去重后重排id。
|
478
488
|
Returns:
|
479
489
|
Tuple[int, int]: (重复组数, 实际删除行数)。
|
480
490
|
"""
|
@@ -486,7 +496,7 @@ class MySQLDeduplicator:
|
|
486
496
|
logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
|
487
497
|
return (0, 0)
|
488
498
|
logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
|
489
|
-
result = self._deduplicate_table(database, table, columns, dry_run, reset_id)
|
499
|
+
result = self._deduplicate_table(database, table, columns, dry_run)
|
490
500
|
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
|
491
501
|
return result
|
492
502
|
except Exception as e:
|
@@ -499,8 +509,7 @@ class MySQLDeduplicator:
|
|
499
509
|
tables: Optional[List[str]] = None,
|
500
510
|
columns_map: Optional[Dict[str, List[str]]] = None,
|
501
511
|
dry_run: bool = False,
|
502
|
-
parallel: bool = False,
|
503
|
-
reset_id: bool = False
|
512
|
+
parallel: bool = False
|
504
513
|
) -> Dict[str, Tuple[int, int]]:
|
505
514
|
"""
|
506
515
|
对指定数据库的所有表进行去重。
|
@@ -511,7 +520,6 @@ class MySQLDeduplicator:
|
|
511
520
|
columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
|
512
521
|
dry_run (bool): 是否为模拟运行。
|
513
522
|
parallel (bool): 是否并行处理。
|
514
|
-
reset_id (bool): 是否在去重后重排id。
|
515
523
|
Returns:
|
516
524
|
Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}。
|
517
525
|
"""
|
@@ -540,7 +548,7 @@ class MySQLDeduplicator:
|
|
540
548
|
logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
|
541
549
|
futures[executor.submit(
|
542
550
|
self.deduplicate_table,
|
543
|
-
database, table, columns, dry_run, reset_id
|
551
|
+
database, table, columns, dry_run
|
544
552
|
)] = table
|
545
553
|
for future in concurrent.futures.as_completed(futures):
|
546
554
|
table = futures[future]
|
@@ -556,7 +564,7 @@ class MySQLDeduplicator:
|
|
556
564
|
for table in target_tables:
|
557
565
|
columns = columns_map.get(table) if columns_map else None
|
558
566
|
dup_count, affected_rows = self.deduplicate_table(
|
559
|
-
database, table, columns, dry_run, reset_id
|
567
|
+
database, table, columns, dry_run
|
560
568
|
)
|
561
569
|
results[table] = (dup_count, affected_rows)
|
562
570
|
total_dup = sum(r[0] for r in results.values())
|
@@ -573,8 +581,7 @@ class MySQLDeduplicator:
|
|
573
581
|
tables_map: Optional[Dict[str, List[str]]] = None,
|
574
582
|
columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
|
575
583
|
dry_run: bool = False,
|
576
|
-
parallel: bool = False,
|
577
|
-
reset_id: bool = False
|
584
|
+
parallel: bool = False
|
578
585
|
) -> Dict[str, Dict[str, Tuple[int, int]]]:
|
579
586
|
"""
|
580
587
|
对所有数据库进行去重。
|
@@ -585,7 +592,6 @@ class MySQLDeduplicator:
|
|
585
592
|
columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
|
586
593
|
dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
|
587
594
|
parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
|
588
|
-
reset_id (bool): 是否在去重后重排id。
|
589
595
|
Returns:
|
590
596
|
Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}。
|
591
597
|
"""
|
@@ -609,7 +615,7 @@ class MySQLDeduplicator:
|
|
609
615
|
db_columns_map = columns_map.get(db) if columns_map else None
|
610
616
|
futures[executor.submit(
|
611
617
|
self.deduplicate_database,
|
612
|
-
db, tables, db_columns_map, dry_run, False, reset_id
|
618
|
+
db, tables, db_columns_map, dry_run, False
|
613
619
|
)] = db
|
614
620
|
for future in concurrent.futures.as_completed(futures):
|
615
621
|
db = futures[future]
|
@@ -625,7 +631,7 @@ class MySQLDeduplicator:
|
|
625
631
|
tables = tables_map.get(db) if tables_map else None
|
626
632
|
db_columns_map = columns_map.get(db) if columns_map else None
|
627
633
|
db_results = self.deduplicate_database(
|
628
|
-
db, tables, db_columns_map, dry_run, parallel, reset_id
|
634
|
+
db, tables, db_columns_map, dry_run, parallel
|
629
635
|
)
|
630
636
|
all_results[db] = db_results
|
631
637
|
total_dup = sum(
|
@@ -720,145 +726,206 @@ class MySQLDeduplicator:
|
|
720
726
|
"""
|
721
727
|
self.close()
|
722
728
|
|
723
|
-
def
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
f'CREATE TABLE `{temp_table}`',
|
765
|
-
create_sql,
|
766
|
-
count=1
|
767
|
-
)
|
768
|
-
create_sql_temp = replace_id_type(create_sql_temp)
|
769
|
-
create_sql_temp = re.sub(r'AUTO_INCREMENT=\d+', '', create_sql_temp)
|
770
|
-
return create_sql_temp
|
771
|
-
|
772
|
-
def _create_and_fill_temp_table(self, database: str, table: str, temp_table: str, pk: str) -> list:
|
773
|
-
"""创建临时表并插入重排id数据,返回所有字段名。"""
|
774
|
-
with self._get_connection() as conn:
|
775
|
-
with conn.cursor() as cursor:
|
776
|
-
cursor.execute(f"USE `{database}`")
|
777
|
-
cursor.execute(f"SHOW COLUMNS FROM `{database}`.`{table}`")
|
778
|
-
columns = [row['Field'] for row in cursor.fetchall()]
|
779
|
-
columns_wo_id = [col for col in columns if col != pk]
|
780
|
-
col_list = ', '.join([f'`{col}`' for col in columns_wo_id])
|
781
|
-
insert_sql = f"INSERT INTO `{database}`.`{temp_table}` ({col_list}, `{pk}`) SELECT {col_list}, (@rownum:=@rownum+1) as `{pk}` FROM `{database}`.`{table}` JOIN (SELECT @rownum:=0) r ORDER BY `{pk}` ASC"
|
782
|
-
cursor.execute(insert_sql)
|
783
|
-
return columns
|
784
|
-
|
785
|
-
def _swap_tables_with_backup(self, database: str, table: str, temp_table: str, bak_table: str):
|
786
|
-
"""原表重命名为备份,临时表变原表名。"""
|
787
|
-
with self._get_connection() as conn:
|
788
|
-
with conn.cursor() as cursor:
|
789
|
-
cursor.execute(f"USE `{database}`")
|
790
|
-
cursor.execute(f"RENAME TABLE `{database}`.`{table}` TO `{database}`.`{bak_table}`")
|
791
|
-
cursor.execute(f"RENAME TABLE `{database}`.`{temp_table}` TO `{database}`.`{table}`")
|
792
|
-
conn.commit()
|
793
|
-
|
794
|
-
def _check_and_cleanup_backup(self, database: str, table: str, bak_table: str) -> bool:
|
795
|
-
"""校验新表和备份表数据量一致,安全删除备份表。"""
|
796
|
-
with self._get_connection() as conn:
|
797
|
-
with conn.cursor() as cursor:
|
798
|
-
cursor.execute(f"USE `{database}`")
|
799
|
-
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
|
800
|
-
new_cnt = cursor.fetchone()['cnt']
|
801
|
-
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{bak_table}`")
|
802
|
-
old_cnt = cursor.fetchone()['cnt']
|
803
|
-
if new_cnt == old_cnt:
|
804
|
-
cursor.execute(f"DROP TABLE `{database}`.`{bak_table}`")
|
805
|
-
conn.commit()
|
806
|
-
return True
|
807
|
-
else:
|
808
|
-
logger.error('id重排后数据量不一致,未删除备份表', {'库': database, '表': table, '新表行数': new_cnt, '备份表行数': old_cnt})
|
809
|
-
return False
|
810
|
-
|
811
|
-
def _rollback_table_swap(self, database: str, table: str, bak_table: str):
|
812
|
-
"""回滚:如bak表存在且原表不存在,则恢复原表名。"""
|
729
|
+
def reorder_id_column(
|
730
|
+
self,
|
731
|
+
database: str,
|
732
|
+
table: Optional[str] = None,
|
733
|
+
id_column: str = "id",
|
734
|
+
dry_run: bool = False,
|
735
|
+
auto_drop_backup: bool = True
|
736
|
+
) -> Any:
|
737
|
+
"""
|
738
|
+
安全重排指定表或指定库下所有表的id列为顺序自增(1,2,3...)。
|
739
|
+
Args:
|
740
|
+
database (str): 数据库名
|
741
|
+
table (Optional[str]): 表名,None时批量处理该库所有表
|
742
|
+
id_column (str): id列名,默认"id"
|
743
|
+
dry_run (bool): 是否为模拟运行
|
744
|
+
auto_drop_backup (bool): 校验通过后自动删除备份表
|
745
|
+
Returns:
|
746
|
+
bool 或 dict: 单表时bool,批量时{表名: bool}
|
747
|
+
"""
|
748
|
+
if not table:
|
749
|
+
# 批量模式,对库下所有表执行
|
750
|
+
try:
|
751
|
+
all_tables = self._get_tables(database)
|
752
|
+
except Exception as e:
|
753
|
+
logger.error('获取库下所有表失败', {"库": database, "异常": str(e)})
|
754
|
+
return {}
|
755
|
+
results = {}
|
756
|
+
for tbl in all_tables:
|
757
|
+
try:
|
758
|
+
res = self.reorder_id_column(database, tbl, id_column, dry_run, auto_drop_backup)
|
759
|
+
results[tbl] = res
|
760
|
+
except Exception as e:
|
761
|
+
logger.error('批量id重排异常', {"库": database, "表": tbl, "异常": str(e)})
|
762
|
+
results[tbl] = False
|
763
|
+
logger.info('批量id重排完成', {"库": database, "结果": results})
|
764
|
+
return results
|
765
|
+
# 单表模式
|
766
|
+
table_quoted = f"`{database}`.`{table}`"
|
767
|
+
if not self._acquire_table_lock(database, table):
|
768
|
+
logger.warning('表级锁获取失败,跳过id重排', {"库": database, "表": table})
|
769
|
+
return False
|
813
770
|
try:
|
771
|
+
# 检查表是否存在
|
772
|
+
if not self._check_table_exists(database, table):
|
773
|
+
logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
|
774
|
+
return False
|
775
|
+
# 检查id列是否存在
|
814
776
|
with self._get_connection() as conn:
|
815
777
|
with conn.cursor() as cursor:
|
816
|
-
cursor.execute(
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
bak_table = f"{table}_bak_{int(time.time())}"
|
832
|
-
try:
|
833
|
-
# 1. 检查外键依赖
|
834
|
-
if self._has_foreign_key_dependency(database, table, pk):
|
835
|
-
logger.warning('存在外键依赖,拒绝重排id', {'库': database, '表': table})
|
778
|
+
cursor.execute("""
|
779
|
+
SELECT COLUMN_NAME, COLUMN_KEY
|
780
|
+
FROM INFORMATION_SCHEMA.COLUMNS
|
781
|
+
WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
|
782
|
+
""", (database, table))
|
783
|
+
columns_info = cursor.fetchall()
|
784
|
+
columns = [row['COLUMN_NAME'] for row in columns_info]
|
785
|
+
id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
|
786
|
+
if id_column not in columns:
|
787
|
+
logger.warning('表无id列,跳过id重排', {"库": database, "表": table})
|
788
|
+
return False
|
789
|
+
# 检查主键是否为单列id
|
790
|
+
pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
|
791
|
+
if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
|
792
|
+
logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
|
836
793
|
return False
|
837
|
-
#
|
838
|
-
create_sql, pk_columns = self._get_table_create_sql_and_pk(database, table)
|
839
|
-
# 3. 生成临时表DDL
|
840
|
-
create_sql_temp = self._make_temp_table_sql(create_sql, table, temp_table, pk, pk_columns)
|
841
|
-
# 4. 创建临时表
|
794
|
+
# 检查外键约束
|
842
795
|
with self._get_connection() as conn:
|
843
796
|
with conn.cursor() as cursor:
|
844
|
-
cursor.execute(
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
797
|
+
cursor.execute("""
|
798
|
+
SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
|
799
|
+
WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s AND REFERENCED_TABLE_NAME IS NOT NULL
|
800
|
+
""", (database, table))
|
801
|
+
if cursor.fetchone():
|
802
|
+
logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
|
803
|
+
return False
|
804
|
+
# 获取表结构
|
805
|
+
with self._get_connection() as conn:
|
806
|
+
with conn.cursor() as cursor:
|
807
|
+
cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
|
808
|
+
create_table_sql = cursor.fetchone()['Create Table']
|
809
|
+
logger.info('开始id重排', {"库": database, "表": table, "重排列": id_column, "dry_run": dry_run, "DDL警告": "MySQL DDL操作不可回滚,建议提前备份!"})
|
810
|
+
if dry_run:
|
811
|
+
logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
|
855
812
|
return True
|
856
|
-
|
813
|
+
temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_reorderid_{os.getpid()}_{threading.get_ident()}")
|
814
|
+
temp_table_quoted = f"`{database}`.`{temp_table}`"
|
815
|
+
backup_table = self._make_safe_table_name(table, prefix="backup_", suffix=f"_{int(time.time())}_{uuid.uuid4().hex[:8]}")
|
816
|
+
backup_table_quoted = f"`{database}`.`{backup_table}`"
|
817
|
+
try:
|
818
|
+
with self._get_connection() as conn:
|
819
|
+
with conn.cursor() as cursor:
|
820
|
+
# 1. 创建临时表,结构同原表
|
821
|
+
try:
|
822
|
+
cursor.execute(f"CREATE TABLE {temp_table_quoted} LIKE {table_quoted}")
|
823
|
+
except Exception as e:
|
824
|
+
logger.error('创建临时表失败', {"库": database, "表": table, "异常": str(e)})
|
825
|
+
return False
|
826
|
+
# 2. 插入数据,id列用ROW_NUMBER重排(MySQL 8+)
|
827
|
+
all_cols = ','.join([f'`{col}`' for col in columns])
|
828
|
+
all_cols_noid = ','.join([f'`{col}`' for col in columns if col != id_column])
|
829
|
+
insert_sql = f"""
|
830
|
+
INSERT INTO {temp_table_quoted} ({all_cols})
|
831
|
+
SELECT ROW_NUMBER() OVER (ORDER BY `{id_column}` ASC) as `{id_column}`, {all_cols_noid}
|
832
|
+
FROM {table_quoted}
|
833
|
+
"""
|
834
|
+
try:
|
835
|
+
cursor.execute(insert_sql)
|
836
|
+
except Exception as e:
|
837
|
+
logger.error('插入重排数据失败', {"库": database, "表": table, "异常": str(e)})
|
838
|
+
try:
|
839
|
+
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
840
|
+
except Exception as drop_e:
|
841
|
+
logger.error('插入失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
842
|
+
return False
|
843
|
+
# 如果id不是主键,尝试加主键(如不冲突)
|
844
|
+
if not id_is_pk:
|
845
|
+
try:
|
846
|
+
cursor.execute(f"ALTER TABLE {temp_table_quoted} ADD PRIMARY KEY(`{id_column}`)")
|
847
|
+
except Exception as e:
|
848
|
+
logger.warning('id列加主键失败,可能已存在其他主键', {"库": database, "表": table, "异常": str(e)})
|
849
|
+
# 3. 原表重命名为备份,临时表重命名为正式表
|
850
|
+
try:
|
851
|
+
cursor.execute(f"RENAME TABLE {table_quoted} TO {backup_table_quoted}, {temp_table_quoted} TO {table_quoted}")
|
852
|
+
except Exception as e:
|
853
|
+
logger.error('RENAME TABLE失败', {"库": database, "表": table, "异常": str(e)})
|
854
|
+
# 回滚:删除临时表
|
855
|
+
try:
|
856
|
+
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
857
|
+
except Exception as drop_e:
|
858
|
+
logger.error('RENAME失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
859
|
+
return False
|
860
|
+
# 4. 校验新表和备份表数据量一致
|
861
|
+
try:
|
862
|
+
cursor.execute(f"SELECT COUNT(*) as cnt FROM {table_quoted}")
|
863
|
+
new_cnt = cursor.fetchone()['cnt']
|
864
|
+
cursor.execute(f"SELECT COUNT(*) as cnt FROM {backup_table_quoted}")
|
865
|
+
old_cnt = cursor.fetchone()['cnt']
|
866
|
+
except Exception as e:
|
867
|
+
logger.error('校验数据量失败', {"库": database, "表": table, "异常": str(e)})
|
868
|
+
return False
|
869
|
+
if new_cnt != old_cnt:
|
870
|
+
logger.error('id重排后数据量不一致,自动回滚', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt})
|
871
|
+
# 回滚:恢复原表
|
872
|
+
try:
|
873
|
+
cursor.execute(f"DROP TABLE {table_quoted}")
|
874
|
+
cursor.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
|
875
|
+
except Exception as e:
|
876
|
+
logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
|
877
|
+
return False
|
878
|
+
logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
|
879
|
+
# 5. 可选:自动删除备份表
|
880
|
+
if auto_drop_backup:
|
881
|
+
try:
|
882
|
+
cursor.execute(f"DROP TABLE {backup_table_quoted}")
|
883
|
+
logger.info('已自动删除备份表', {"库": database, "表": table, "备份表名": backup_table})
|
884
|
+
except Exception as e:
|
885
|
+
logger.error('自动删除备份表失败', {"库": database, "表": table, "异常": str(e)})
|
886
|
+
return True
|
887
|
+
except Exception as e:
|
888
|
+
logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
|
889
|
+
# 回滚:如临时表存在则删掉,恢复原表结构
|
890
|
+
with self._get_connection() as conn:
|
891
|
+
with conn.cursor() as cursor:
|
892
|
+
try:
|
893
|
+
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
894
|
+
except Exception as drop_e:
|
895
|
+
logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
896
|
+
# 恢复原表(如备份表存在)
|
897
|
+
try:
|
898
|
+
with self._get_connection() as conn2:
|
899
|
+
with conn2.cursor() as cursor2:
|
900
|
+
if self._check_table_exists(database, backup_table):
|
901
|
+
cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
|
902
|
+
cursor2.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
|
903
|
+
logger.info('已自动恢复原表', {"库": database, "表": table, "备份表名": backup_table})
|
904
|
+
except Exception as recover_e:
|
905
|
+
logger.error('回滚时恢复原表失败', {"库": database, "表": table, "异常": str(recover_e)})
|
857
906
|
return False
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
907
|
+
finally:
|
908
|
+
self._release_table_lock(database, table)
|
909
|
+
|
910
|
+
@staticmethod
|
911
|
+
def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
|
912
|
+
"""
|
913
|
+
生成安全的MySQL表名,确保总长度不超过max_length字节。
|
914
|
+
:param base: 原始表名
|
915
|
+
:param prefix: 前缀
|
916
|
+
:param suffix: 后缀
|
917
|
+
:param max_length: 最大长度,默认64
|
918
|
+
:return: 安全表名
|
919
|
+
"""
|
920
|
+
# 只允许字母数字下划线
|
921
|
+
base = re.sub(r'[^a-zA-Z0-9_]', '_', base)
|
922
|
+
prefix = re.sub(r'[^a-zA-Z0-9_]', '_', prefix)
|
923
|
+
suffix = re.sub(r'[^a-zA-Z0-9_]', '_', suffix)
|
924
|
+
remain = max_length - len(prefix) - len(suffix)
|
925
|
+
if remain < 1:
|
926
|
+
# 前后缀太长,直接截断
|
927
|
+
return (prefix + suffix)[:max_length]
|
928
|
+
return f"{prefix}{base[:remain]}{suffix}"[:max_length]
|
862
929
|
|
863
930
|
|
864
931
|
def main():
|
@@ -870,14 +937,16 @@ def main():
|
|
870
937
|
)
|
871
938
|
|
872
939
|
# 全库去重(单线程)
|
873
|
-
deduplicator.deduplicate_all(dry_run=False, parallel=
|
940
|
+
deduplicator.deduplicate_all(dry_run=False, parallel=True)
|
874
941
|
|
875
942
|
# # 指定数据库去重(多线程)
|
876
|
-
#
|
877
|
-
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reset_id=False)
|
943
|
+
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False)
|
878
944
|
|
879
945
|
# # 指定表去重(使用特定列)
|
880
|
-
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False
|
946
|
+
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
|
947
|
+
|
948
|
+
# # 重排id列
|
949
|
+
# deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
|
881
950
|
|
882
951
|
# 关闭连接
|
883
952
|
deduplicator.close()
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=JqV56ilza72jpkf_fztVtAdeSmcdPr0BmGGo9FFjGrA,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
8
8
|
mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=Znmjn4sI1Mj2koSPTDojFwg_1MTgk3GZTFZyhSRwn7s,46746
|
12
12
|
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
14
|
mdbq/mysql/uploader.py,sha256=LxPlAfSNhQbLu-or4wxa-vLjCw5_PIN3ZVoksWUJazQ,61701
|
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
26
|
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
-
mdbq-3.11.
|
28
|
-
mdbq-3.11.
|
29
|
-
mdbq-3.11.
|
30
|
-
mdbq-3.11.
|
27
|
+
mdbq-3.11.8.dist-info/METADATA,sha256=EJtaHsIzWmcB9hTRg1NZeDd55Zez0lu6FPD_ZQB9nMw,364
|
28
|
+
mdbq-3.11.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.11.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.11.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|