mdbq 3.11.7__py3-none-any.whl → 3.11.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +211 -159
- {mdbq-3.11.7.dist-info → mdbq-3.11.8.dist-info}/METADATA +1 -1
- {mdbq-3.11.7.dist-info → mdbq-3.11.8.dist-info}/RECORD +6 -6
- {mdbq-3.11.7.dist-info → mdbq-3.11.8.dist-info}/WHEEL +0 -0
- {mdbq-3.11.7.dist-info → mdbq-3.11.8.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.7'
|
1
|
+
VERSION = '3.11.8'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -13,6 +13,7 @@ import concurrent.futures
|
|
13
13
|
from collections import defaultdict
|
14
14
|
import sys
|
15
15
|
from datetime import datetime
|
16
|
+
import uuid
|
16
17
|
|
17
18
|
|
18
19
|
warnings.filterwarnings('ignore')
|
@@ -81,7 +82,7 @@ class MySQLDeduplicator:
|
|
81
82
|
exclude_columns: Optional[List[str]] = None,
|
82
83
|
exclude_databases: Optional[List[str]] = None,
|
83
84
|
exclude_tables: Optional[Dict[str, List[str]]] = None,
|
84
|
-
duplicate_keep_mode: str = 'keep_one'
|
85
|
+
duplicate_keep_mode: str = 'keep_one'
|
85
86
|
) -> None:
|
86
87
|
"""
|
87
88
|
初始化去重处理器
|
@@ -191,7 +192,6 @@ class MySQLDeduplicator:
|
|
191
192
|
raise ConnectionError("连接池已关闭")
|
192
193
|
try:
|
193
194
|
conn = self.pool.connection()
|
194
|
-
logger.debug("成功获取数据库连接")
|
195
195
|
return conn
|
196
196
|
except Exception as e:
|
197
197
|
logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
|
@@ -333,8 +333,7 @@ class MySQLDeduplicator:
|
|
333
333
|
database: str,
|
334
334
|
table: str,
|
335
335
|
columns: Optional[List[str]] = None,
|
336
|
-
dry_run: bool = False,
|
337
|
-
reset_id: bool = False
|
336
|
+
dry_run: bool = False
|
338
337
|
) -> Tuple[int, int]:
|
339
338
|
"""
|
340
339
|
执行单表去重。
|
@@ -344,7 +343,6 @@ class MySQLDeduplicator:
|
|
344
343
|
table (str): 表名。
|
345
344
|
columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
|
346
345
|
dry_run (bool): 是否为模拟运行(只统计不实际删除)。
|
347
|
-
reset_id (bool): 是否在去重后重排id。
|
348
346
|
Returns:
|
349
347
|
Tuple[int, int]: (重复组数, 实际删除行数)。
|
350
348
|
"""
|
@@ -380,8 +378,7 @@ class MySQLDeduplicator:
|
|
380
378
|
return (0, 0)
|
381
379
|
# 统一用反引号包裹
|
382
380
|
column_list = ', '.join([f'`{col}`' for col in use_columns])
|
383
|
-
temp_table = f"temp_
|
384
|
-
temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
|
381
|
+
temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
|
385
382
|
pk = self.primary_key
|
386
383
|
# 主键判断也用小写
|
387
384
|
if pk.lower() not in all_columns_lower and pk != 'id':
|
@@ -450,9 +447,6 @@ class MySQLDeduplicator:
|
|
450
447
|
if batch_deleted < self.batch_size:
|
451
448
|
break
|
452
449
|
logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns, "去重模式": self.duplicate_keep_mode})
|
453
|
-
# 新增:去重后重排id
|
454
|
-
if reset_id and affected_rows > 0:
|
455
|
-
self._reset_id_column(database, table)
|
456
450
|
else:
|
457
451
|
logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
|
458
452
|
affected_rows = 0
|
@@ -481,8 +475,7 @@ class MySQLDeduplicator:
|
|
481
475
|
database: str,
|
482
476
|
table: str,
|
483
477
|
columns: Optional[List[str]] = None,
|
484
|
-
dry_run: bool = False,
|
485
|
-
reset_id: bool = False
|
478
|
+
dry_run: bool = False
|
486
479
|
) -> Tuple[int, int]:
|
487
480
|
"""
|
488
481
|
对指定表进行去重。
|
@@ -492,7 +485,6 @@ class MySQLDeduplicator:
|
|
492
485
|
table (str): 表名。
|
493
486
|
columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
|
494
487
|
dry_run (bool): 是否为模拟运行(只统计不实际删除)。
|
495
|
-
reset_id (bool): 是否在去重后重排id。
|
496
488
|
Returns:
|
497
489
|
Tuple[int, int]: (重复组数, 实际删除行数)。
|
498
490
|
"""
|
@@ -504,7 +496,7 @@ class MySQLDeduplicator:
|
|
504
496
|
logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
|
505
497
|
return (0, 0)
|
506
498
|
logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
|
507
|
-
result = self._deduplicate_table(database, table, columns, dry_run, reset_id)
|
499
|
+
result = self._deduplicate_table(database, table, columns, dry_run)
|
508
500
|
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
|
509
501
|
return result
|
510
502
|
except Exception as e:
|
@@ -517,8 +509,7 @@ class MySQLDeduplicator:
|
|
517
509
|
tables: Optional[List[str]] = None,
|
518
510
|
columns_map: Optional[Dict[str, List[str]]] = None,
|
519
511
|
dry_run: bool = False,
|
520
|
-
parallel: bool = False,
|
521
|
-
reset_id: bool = False
|
512
|
+
parallel: bool = False
|
522
513
|
) -> Dict[str, Tuple[int, int]]:
|
523
514
|
"""
|
524
515
|
对指定数据库的所有表进行去重。
|
@@ -529,7 +520,6 @@ class MySQLDeduplicator:
|
|
529
520
|
columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
|
530
521
|
dry_run (bool): 是否为模拟运行。
|
531
522
|
parallel (bool): 是否并行处理。
|
532
|
-
reset_id (bool): 是否在去重后重排id。
|
533
523
|
Returns:
|
534
524
|
Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}。
|
535
525
|
"""
|
@@ -558,7 +548,7 @@ class MySQLDeduplicator:
|
|
558
548
|
logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
|
559
549
|
futures[executor.submit(
|
560
550
|
self.deduplicate_table,
|
561
|
-
database, table, columns, dry_run
|
551
|
+
database, table, columns, dry_run
|
562
552
|
)] = table
|
563
553
|
for future in concurrent.futures.as_completed(futures):
|
564
554
|
table = futures[future]
|
@@ -574,7 +564,7 @@ class MySQLDeduplicator:
|
|
574
564
|
for table in target_tables:
|
575
565
|
columns = columns_map.get(table) if columns_map else None
|
576
566
|
dup_count, affected_rows = self.deduplicate_table(
|
577
|
-
database, table, columns, dry_run
|
567
|
+
database, table, columns, dry_run
|
578
568
|
)
|
579
569
|
results[table] = (dup_count, affected_rows)
|
580
570
|
total_dup = sum(r[0] for r in results.values())
|
@@ -591,8 +581,7 @@ class MySQLDeduplicator:
|
|
591
581
|
tables_map: Optional[Dict[str, List[str]]] = None,
|
592
582
|
columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
|
593
583
|
dry_run: bool = False,
|
594
|
-
parallel: bool = False,
|
595
|
-
reset_id: bool = False
|
584
|
+
parallel: bool = False
|
596
585
|
) -> Dict[str, Dict[str, Tuple[int, int]]]:
|
597
586
|
"""
|
598
587
|
对所有数据库进行去重。
|
@@ -603,7 +592,6 @@ class MySQLDeduplicator:
|
|
603
592
|
columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
|
604
593
|
dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
|
605
594
|
parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
|
606
|
-
reset_id (bool): 是否在去重后重排id。
|
607
595
|
Returns:
|
608
596
|
Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}。
|
609
597
|
"""
|
@@ -627,7 +615,7 @@ class MySQLDeduplicator:
|
|
627
615
|
db_columns_map = columns_map.get(db) if columns_map else None
|
628
616
|
futures[executor.submit(
|
629
617
|
self.deduplicate_database,
|
630
|
-
db, tables, db_columns_map, dry_run, False
|
618
|
+
db, tables, db_columns_map, dry_run, False
|
631
619
|
)] = db
|
632
620
|
for future in concurrent.futures.as_completed(futures):
|
633
621
|
db = futures[future]
|
@@ -643,7 +631,7 @@ class MySQLDeduplicator:
|
|
643
631
|
tables = tables_map.get(db) if tables_map else None
|
644
632
|
db_columns_map = columns_map.get(db) if columns_map else None
|
645
633
|
db_results = self.deduplicate_database(
|
646
|
-
db, tables, db_columns_map, dry_run, parallel
|
634
|
+
db, tables, db_columns_map, dry_run, parallel
|
647
635
|
)
|
648
636
|
all_results[db] = db_results
|
649
637
|
total_dup = sum(
|
@@ -738,145 +726,206 @@ class MySQLDeduplicator:
|
|
738
726
|
"""
|
739
727
|
self.close()
|
740
728
|
|
741
|
-
def
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
f'CREATE TABLE `{temp_table}`',
|
783
|
-
create_sql,
|
784
|
-
count=1
|
785
|
-
)
|
786
|
-
create_sql_temp = replace_id_type(create_sql_temp)
|
787
|
-
create_sql_temp = re.sub(r'AUTO_INCREMENT=\d+', '', create_sql_temp)
|
788
|
-
return create_sql_temp
|
789
|
-
|
790
|
-
def _create_and_fill_temp_table(self, database: str, table: str, temp_table: str, pk: str) -> list:
|
791
|
-
"""创建临时表并插入重排id数据,返回所有字段名。"""
|
792
|
-
with self._get_connection() as conn:
|
793
|
-
with conn.cursor() as cursor:
|
794
|
-
cursor.execute(f"USE `{database}`")
|
795
|
-
cursor.execute(f"SHOW COLUMNS FROM `{database}`.`{table}`")
|
796
|
-
columns = [row['Field'] for row in cursor.fetchall()]
|
797
|
-
columns_wo_id = [col for col in columns if col != pk]
|
798
|
-
col_list = ', '.join([f'`{col}`' for col in columns_wo_id])
|
799
|
-
insert_sql = f"INSERT INTO `{database}`.`{temp_table}` ({col_list}, `{pk}`) SELECT {col_list}, (@rownum:=@rownum+1) as `{pk}` FROM `{database}`.`{table}` JOIN (SELECT @rownum:=0) r ORDER BY `{pk}` ASC"
|
800
|
-
cursor.execute(insert_sql)
|
801
|
-
return columns
|
802
|
-
|
803
|
-
def _swap_tables_with_backup(self, database: str, table: str, temp_table: str, bak_table: str):
|
804
|
-
"""原表重命名为备份,临时表变原表名。"""
|
805
|
-
with self._get_connection() as conn:
|
806
|
-
with conn.cursor() as cursor:
|
807
|
-
cursor.execute(f"USE `{database}`")
|
808
|
-
cursor.execute(f"RENAME TABLE `{database}`.`{table}` TO `{database}`.`{bak_table}`")
|
809
|
-
cursor.execute(f"RENAME TABLE `{database}`.`{temp_table}` TO `{database}`.`{table}`")
|
810
|
-
conn.commit()
|
811
|
-
|
812
|
-
def _check_and_cleanup_backup(self, database: str, table: str, bak_table: str) -> bool:
|
813
|
-
"""校验新表和备份表数据量一致,安全删除备份表。"""
|
814
|
-
with self._get_connection() as conn:
|
815
|
-
with conn.cursor() as cursor:
|
816
|
-
cursor.execute(f"USE `{database}`")
|
817
|
-
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
|
818
|
-
new_cnt = cursor.fetchone()['cnt']
|
819
|
-
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{bak_table}`")
|
820
|
-
old_cnt = cursor.fetchone()['cnt']
|
821
|
-
if new_cnt == old_cnt:
|
822
|
-
cursor.execute(f"DROP TABLE `{database}`.`{bak_table}`")
|
823
|
-
conn.commit()
|
824
|
-
return True
|
825
|
-
else:
|
826
|
-
logger.error('id重排后数据量不一致,未删除备份表', {'库': database, '表': table, '新表行数': new_cnt, '备份表行数': old_cnt})
|
827
|
-
return False
|
828
|
-
|
829
|
-
def _rollback_table_swap(self, database: str, table: str, bak_table: str):
|
830
|
-
"""回滚:如bak表存在且原表不存在,则恢复原表名。"""
|
729
|
+
def reorder_id_column(
|
730
|
+
self,
|
731
|
+
database: str,
|
732
|
+
table: Optional[str] = None,
|
733
|
+
id_column: str = "id",
|
734
|
+
dry_run: bool = False,
|
735
|
+
auto_drop_backup: bool = True
|
736
|
+
) -> Any:
|
737
|
+
"""
|
738
|
+
安全重排指定表或指定库下所有表的id列为顺序自增(1,2,3...)。
|
739
|
+
Args:
|
740
|
+
database (str): 数据库名
|
741
|
+
table (Optional[str]): 表名,None时批量处理该库所有表
|
742
|
+
id_column (str): id列名,默认"id"
|
743
|
+
dry_run (bool): 是否为模拟运行
|
744
|
+
auto_drop_backup (bool): 校验通过后自动删除备份表
|
745
|
+
Returns:
|
746
|
+
bool 或 dict: 单表时bool,批量时{表名: bool}
|
747
|
+
"""
|
748
|
+
if not table:
|
749
|
+
# 批量模式,对库下所有表执行
|
750
|
+
try:
|
751
|
+
all_tables = self._get_tables(database)
|
752
|
+
except Exception as e:
|
753
|
+
logger.error('获取库下所有表失败', {"库": database, "异常": str(e)})
|
754
|
+
return {}
|
755
|
+
results = {}
|
756
|
+
for tbl in all_tables:
|
757
|
+
try:
|
758
|
+
res = self.reorder_id_column(database, tbl, id_column, dry_run, auto_drop_backup)
|
759
|
+
results[tbl] = res
|
760
|
+
except Exception as e:
|
761
|
+
logger.error('批量id重排异常', {"库": database, "表": tbl, "异常": str(e)})
|
762
|
+
results[tbl] = False
|
763
|
+
logger.info('批量id重排完成', {"库": database, "结果": results})
|
764
|
+
return results
|
765
|
+
# 单表模式
|
766
|
+
table_quoted = f"`{database}`.`{table}`"
|
767
|
+
if not self._acquire_table_lock(database, table):
|
768
|
+
logger.warning('表级锁获取失败,跳过id重排', {"库": database, "表": table})
|
769
|
+
return False
|
831
770
|
try:
|
771
|
+
# 检查表是否存在
|
772
|
+
if not self._check_table_exists(database, table):
|
773
|
+
logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
|
774
|
+
return False
|
775
|
+
# 检查id列是否存在
|
832
776
|
with self._get_connection() as conn:
|
833
777
|
with conn.cursor() as cursor:
|
834
|
-
cursor.execute(
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
bak_table = f"{table}_bak_{int(time.time())}"
|
850
|
-
try:
|
851
|
-
# 1. 检查外键依赖
|
852
|
-
if self._has_foreign_key_dependency(database, table, pk):
|
853
|
-
logger.warning('存在外键依赖,拒绝重排id', {'库': database, '表': table})
|
778
|
+
cursor.execute("""
|
779
|
+
SELECT COLUMN_NAME, COLUMN_KEY
|
780
|
+
FROM INFORMATION_SCHEMA.COLUMNS
|
781
|
+
WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
|
782
|
+
""", (database, table))
|
783
|
+
columns_info = cursor.fetchall()
|
784
|
+
columns = [row['COLUMN_NAME'] for row in columns_info]
|
785
|
+
id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
|
786
|
+
if id_column not in columns:
|
787
|
+
logger.warning('表无id列,跳过id重排', {"库": database, "表": table})
|
788
|
+
return False
|
789
|
+
# 检查主键是否为单列id
|
790
|
+
pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
|
791
|
+
if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
|
792
|
+
logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
|
854
793
|
return False
|
855
|
-
#
|
856
|
-
create_sql, pk_columns = self._get_table_create_sql_and_pk(database, table)
|
857
|
-
# 3. 生成临时表DDL
|
858
|
-
create_sql_temp = self._make_temp_table_sql(create_sql, table, temp_table, pk, pk_columns)
|
859
|
-
# 4. 创建临时表
|
794
|
+
# 检查外键约束
|
860
795
|
with self._get_connection() as conn:
|
861
796
|
with conn.cursor() as cursor:
|
862
|
-
cursor.execute(
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
797
|
+
cursor.execute("""
|
798
|
+
SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
|
799
|
+
WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s AND REFERENCED_TABLE_NAME IS NOT NULL
|
800
|
+
""", (database, table))
|
801
|
+
if cursor.fetchone():
|
802
|
+
logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
|
803
|
+
return False
|
804
|
+
# 获取表结构
|
805
|
+
with self._get_connection() as conn:
|
806
|
+
with conn.cursor() as cursor:
|
807
|
+
cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
|
808
|
+
create_table_sql = cursor.fetchone()['Create Table']
|
809
|
+
logger.info('开始id重排', {"库": database, "表": table, "重排列": id_column, "dry_run": dry_run, "DDL警告": "MySQL DDL操作不可回滚,建议提前备份!"})
|
810
|
+
if dry_run:
|
811
|
+
logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
|
873
812
|
return True
|
874
|
-
|
813
|
+
temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_reorderid_{os.getpid()}_{threading.get_ident()}")
|
814
|
+
temp_table_quoted = f"`{database}`.`{temp_table}`"
|
815
|
+
backup_table = self._make_safe_table_name(table, prefix="backup_", suffix=f"_{int(time.time())}_{uuid.uuid4().hex[:8]}")
|
816
|
+
backup_table_quoted = f"`{database}`.`{backup_table}`"
|
817
|
+
try:
|
818
|
+
with self._get_connection() as conn:
|
819
|
+
with conn.cursor() as cursor:
|
820
|
+
# 1. 创建临时表,结构同原表
|
821
|
+
try:
|
822
|
+
cursor.execute(f"CREATE TABLE {temp_table_quoted} LIKE {table_quoted}")
|
823
|
+
except Exception as e:
|
824
|
+
logger.error('创建临时表失败', {"库": database, "表": table, "异常": str(e)})
|
825
|
+
return False
|
826
|
+
# 2. 插入数据,id列用ROW_NUMBER重排(MySQL 8+)
|
827
|
+
all_cols = ','.join([f'`{col}`' for col in columns])
|
828
|
+
all_cols_noid = ','.join([f'`{col}`' for col in columns if col != id_column])
|
829
|
+
insert_sql = f"""
|
830
|
+
INSERT INTO {temp_table_quoted} ({all_cols})
|
831
|
+
SELECT ROW_NUMBER() OVER (ORDER BY `{id_column}` ASC) as `{id_column}`, {all_cols_noid}
|
832
|
+
FROM {table_quoted}
|
833
|
+
"""
|
834
|
+
try:
|
835
|
+
cursor.execute(insert_sql)
|
836
|
+
except Exception as e:
|
837
|
+
logger.error('插入重排数据失败', {"库": database, "表": table, "异常": str(e)})
|
838
|
+
try:
|
839
|
+
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
840
|
+
except Exception as drop_e:
|
841
|
+
logger.error('插入失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
842
|
+
return False
|
843
|
+
# 如果id不是主键,尝试加主键(如不冲突)
|
844
|
+
if not id_is_pk:
|
845
|
+
try:
|
846
|
+
cursor.execute(f"ALTER TABLE {temp_table_quoted} ADD PRIMARY KEY(`{id_column}`)")
|
847
|
+
except Exception as e:
|
848
|
+
logger.warning('id列加主键失败,可能已存在其他主键', {"库": database, "表": table, "异常": str(e)})
|
849
|
+
# 3. 原表重命名为备份,临时表重命名为正式表
|
850
|
+
try:
|
851
|
+
cursor.execute(f"RENAME TABLE {table_quoted} TO {backup_table_quoted}, {temp_table_quoted} TO {table_quoted}")
|
852
|
+
except Exception as e:
|
853
|
+
logger.error('RENAME TABLE失败', {"库": database, "表": table, "异常": str(e)})
|
854
|
+
# 回滚:删除临时表
|
855
|
+
try:
|
856
|
+
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
857
|
+
except Exception as drop_e:
|
858
|
+
logger.error('RENAME失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
859
|
+
return False
|
860
|
+
# 4. 校验新表和备份表数据量一致
|
861
|
+
try:
|
862
|
+
cursor.execute(f"SELECT COUNT(*) as cnt FROM {table_quoted}")
|
863
|
+
new_cnt = cursor.fetchone()['cnt']
|
864
|
+
cursor.execute(f"SELECT COUNT(*) as cnt FROM {backup_table_quoted}")
|
865
|
+
old_cnt = cursor.fetchone()['cnt']
|
866
|
+
except Exception as e:
|
867
|
+
logger.error('校验数据量失败', {"库": database, "表": table, "异常": str(e)})
|
868
|
+
return False
|
869
|
+
if new_cnt != old_cnt:
|
870
|
+
logger.error('id重排后数据量不一致,自动回滚', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt})
|
871
|
+
# 回滚:恢复原表
|
872
|
+
try:
|
873
|
+
cursor.execute(f"DROP TABLE {table_quoted}")
|
874
|
+
cursor.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
|
875
|
+
except Exception as e:
|
876
|
+
logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
|
877
|
+
return False
|
878
|
+
logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
|
879
|
+
# 5. 可选:自动删除备份表
|
880
|
+
if auto_drop_backup:
|
881
|
+
try:
|
882
|
+
cursor.execute(f"DROP TABLE {backup_table_quoted}")
|
883
|
+
logger.info('已自动删除备份表', {"库": database, "表": table, "备份表名": backup_table})
|
884
|
+
except Exception as e:
|
885
|
+
logger.error('自动删除备份表失败', {"库": database, "表": table, "异常": str(e)})
|
886
|
+
return True
|
887
|
+
except Exception as e:
|
888
|
+
logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
|
889
|
+
# 回滚:如临时表存在则删掉,恢复原表结构
|
890
|
+
with self._get_connection() as conn:
|
891
|
+
with conn.cursor() as cursor:
|
892
|
+
try:
|
893
|
+
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
894
|
+
except Exception as drop_e:
|
895
|
+
logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
896
|
+
# 恢复原表(如备份表存在)
|
897
|
+
try:
|
898
|
+
with self._get_connection() as conn2:
|
899
|
+
with conn2.cursor() as cursor2:
|
900
|
+
if self._check_table_exists(database, backup_table):
|
901
|
+
cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
|
902
|
+
cursor2.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
|
903
|
+
logger.info('已自动恢复原表', {"库": database, "表": table, "备份表名": backup_table})
|
904
|
+
except Exception as recover_e:
|
905
|
+
logger.error('回滚时恢复原表失败', {"库": database, "表": table, "异常": str(recover_e)})
|
875
906
|
return False
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
907
|
+
finally:
|
908
|
+
self._release_table_lock(database, table)
|
909
|
+
|
910
|
+
@staticmethod
|
911
|
+
def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
|
912
|
+
"""
|
913
|
+
生成安全的MySQL表名,确保总长度不超过max_length字节。
|
914
|
+
:param base: 原始表名
|
915
|
+
:param prefix: 前缀
|
916
|
+
:param suffix: 后缀
|
917
|
+
:param max_length: 最大长度,默认64
|
918
|
+
:return: 安全表名
|
919
|
+
"""
|
920
|
+
# 只允许字母数字下划线
|
921
|
+
base = re.sub(r'[^a-zA-Z0-9_]', '_', base)
|
922
|
+
prefix = re.sub(r'[^a-zA-Z0-9_]', '_', prefix)
|
923
|
+
suffix = re.sub(r'[^a-zA-Z0-9_]', '_', suffix)
|
924
|
+
remain = max_length - len(prefix) - len(suffix)
|
925
|
+
if remain < 1:
|
926
|
+
# 前后缀太长,直接截断
|
927
|
+
return (prefix + suffix)[:max_length]
|
928
|
+
return f"{prefix}{base[:remain]}{suffix}"[:max_length]
|
880
929
|
|
881
930
|
|
882
931
|
def main():
|
@@ -888,17 +937,20 @@ def main():
|
|
888
937
|
)
|
889
938
|
|
890
939
|
# 全库去重(单线程)
|
891
|
-
deduplicator.deduplicate_all(dry_run=False, parallel=
|
940
|
+
deduplicator.deduplicate_all(dry_run=False, parallel=True)
|
892
941
|
|
893
942
|
# # 指定数据库去重(多线程)
|
894
|
-
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=
|
943
|
+
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False)
|
895
944
|
|
896
945
|
# # 指定表去重(使用特定列)
|
897
|
-
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False
|
946
|
+
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
|
947
|
+
|
948
|
+
# # 重排id列
|
949
|
+
# deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
|
898
950
|
|
899
951
|
# 关闭连接
|
900
952
|
deduplicator.close()
|
901
953
|
|
902
954
|
if __name__ == '__main__':
|
903
|
-
|
955
|
+
main()
|
904
956
|
pass
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=JqV56ilza72jpkf_fztVtAdeSmcdPr0BmGGo9FFjGrA,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
8
8
|
mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=Znmjn4sI1Mj2koSPTDojFwg_1MTgk3GZTFZyhSRwn7s,46746
|
12
12
|
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
14
|
mdbq/mysql/uploader.py,sha256=LxPlAfSNhQbLu-or4wxa-vLjCw5_PIN3ZVoksWUJazQ,61701
|
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
26
|
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
-
mdbq-3.11.
|
28
|
-
mdbq-3.11.
|
29
|
-
mdbq-3.11.
|
30
|
-
mdbq-3.11.
|
27
|
+
mdbq-3.11.8.dist-info/METADATA,sha256=EJtaHsIzWmcB9hTRg1NZeDd55Zez0lu6FPD_ZQB9nMw,364
|
28
|
+
mdbq-3.11.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.11.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.11.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|