mdbq-3.11.6-py3-none-any.whl → mdbq-3.11.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
- VERSION = '3.11.6'
+ VERSION = '3.11.8'
mdbq/mysql/deduplicator.py CHANGED
@@ -13,6 +13,7 @@ import concurrent.futures
  from collections import defaultdict
  import sys
  from datetime import datetime
+ import uuid
 
 
  warnings.filterwarnings('ignore')
@@ -80,7 +81,8 @@ class MySQLDeduplicator:
  date_column: str = '日期',
  exclude_columns: Optional[List[str]] = None,
  exclude_databases: Optional[List[str]] = None,
- exclude_tables: Optional[Dict[str, List[str]]] = None
+ exclude_tables: Optional[Dict[str, List[str]]] = None,
+ duplicate_keep_mode: str = 'keep_one'
  ) -> None:
  """
  初始化去重处理器
@@ -90,6 +92,7 @@ class MySQLDeduplicator:
  :param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
  :param exclude_databases: 排除的数据库名列表
  :param exclude_tables: 排除的表名字典 {数据库名: [表名, ...]}
+ :param duplicate_keep_mode: 'keep_one'(默认,重复组保留一条),'remove_all'(全部删除重复组)
  """
  # 连接池状态标志
  self._closed = False
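
The new duplicate_keep_mode parameter selects between two behaviours: 'keep_one' keeps one row per duplicate group (the row with the smallest id), while 'remove_all' deletes every row that belongs to a group with more than one member. A minimal pure-Python sketch of those semantics (illustration only, not package code; column and row values are made up):

    from collections import defaultdict

    def dedup_rows(rows, key_cols, mode='keep_one'):
        # Group rows by the values of the dedup columns.
        groups = defaultdict(list)
        for row in rows:
            groups[tuple(row[c] for c in key_cols)].append(row)
        kept = []
        for group in groups.values():
            if mode == 'remove_all':
                # Only groups with a single member survive.
                if len(group) == 1:
                    kept.extend(group)
            else:  # 'keep_one'
                # One representative per group survives.
                kept.append(group[0])
        return kept

    rows = [{'sku': 'a', 'qty': 1}, {'sku': 'a', 'qty': 1}, {'sku': 'b', 'qty': 2}]
    print(len(dedup_rows(rows, ['sku', 'qty'])))                     # 2
    print(len(dedup_rows(rows, ['sku', 'qty'], mode='remove_all')))  # 1
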
@@ -173,6 +176,8 @@ class MySQLDeduplicator:
  self.exclude_databases = set([db.lower() for db in exclude_databases]) if exclude_databases else set()
  self.exclude_tables = {k.lower(): set([t.lower() for t in v]) for k, v in (exclude_tables or {}).items()}
 
+ self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
+
  def _get_connection(self) -> pymysql.connections.Connection:
  """
  从连接池获取一个数据库连接。
@@ -187,7 +192,6 @@ class MySQLDeduplicator:
  raise ConnectionError("连接池已关闭")
  try:
  conn = self.pool.connection()
- logger.debug("成功获取数据库连接")
  return conn
  except Exception as e:
  logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
@@ -329,8 +333,7 @@ class MySQLDeduplicator:
  database: str,
  table: str,
  columns: Optional[List[str]] = None,
- dry_run: bool = False,
- reset_id: bool = False
+ dry_run: bool = False
  ) -> Tuple[int, int]:
  """
  执行单表去重。
@@ -340,7 +343,6 @@ class MySQLDeduplicator:
  table (str): 表名。
  columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
  dry_run (bool): 是否为模拟运行(只统计不实际删除)。
- reset_id (bool): 是否在去重后重排id。
  Returns:
  Tuple[int, int]: (重复组数, 实际删除行数)。
  """
@@ -376,8 +378,7 @@ class MySQLDeduplicator:
  return (0, 0)
  # 统一用反引号包裹
  column_list = ', '.join([f'`{col}`' for col in use_columns])
- temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
- temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
+ temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
  pk = self.primary_key
  # 主键判断也用小写
  if pk.lower() not in all_columns_lower and pk != 'id':
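
The temporary table name is now built by the _make_safe_table_name helper added near the end of this diff, instead of the inline re.sub/truncate it replaces: it substitutes characters MySQL identifiers cannot contain and keeps the result within the 64-character identifier limit. A standalone sketch of that logic (the example table name is made up):

    import os
    import re
    import threading

    def make_safe_table_name(base, prefix='', suffix='', max_length=64):
        # Replace anything outside [A-Za-z0-9_] and cap the total length.
        base = re.sub(r'[^a-zA-Z0-9_]', '_', base)
        prefix = re.sub(r'[^a-zA-Z0-9_]', '_', prefix)
        suffix = re.sub(r'[^a-zA-Z0-9_]', '_', suffix)
        remain = max_length - len(prefix) - len(suffix)
        if remain < 1:
            return (prefix + suffix)[:max_length]
        return f"{prefix}{base[:remain]}{suffix}"[:max_length]

    name = make_safe_table_name('订单明细表', prefix='temp_',
                                suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
    print(name, len(name) <= 64)
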
@@ -416,14 +417,28 @@ class MySQLDeduplicator:
  if not dry_run:
  # 分批删除,避免锁表
  while True:
- delete_dup_sql = f"""
- DELETE FROM `{database}`.`{table}`
- WHERE `{pk_real}` NOT IN (
- SELECT `min_id` FROM `{database}`.`{temp_table}`
- ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
- AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
- LIMIT {self.batch_size}
- """
+ if self.duplicate_keep_mode == 'remove_all':
+ # 删除所有重复组的所有记录
+ delete_dup_sql = f"""
+ DELETE FROM `{database}`.`{table}`
+ WHERE ({', '.join([f'`{col}`' for col in use_columns])}) IN (
+ SELECT {column_list} FROM `{database}`.`{temp_table}`
+ ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
+ LIMIT {self.batch_size}
+ """
+ else:
+ # 修正:只删除重复组中不是min_id的行,唯一数据不动
+ delete_dup_sql = f"""
+ DELETE FROM `{database}`.`{table}` t
+ WHERE EXISTS (
+ SELECT 1 FROM `{database}`.`{temp_table}` tmp
+ WHERE
+ {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+ AND t.`{pk_real}` <> tmp.`min_id`
+ )
+ {'AND' if use_time_filter else ''} {f't.`{time_col}` >= \'{self._dedup_start_date}\' AND t.`{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
+ LIMIT {self.batch_size}
+ """
  logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
  cursor.execute(delete_dup_sql)
  batch_deleted = cursor.rowcount
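
In 'keep_one' mode the rewritten statement deletes only rows whose primary key differs from their group's min_id, matching group membership with the NULL-safe comparison operator <=> so that rows whose dedup columns contain NULL still join their group; in 'remove_all' mode the whole group is matched with a multi-column IN against the temp table. A standalone sketch (illustration only; table and column names are made up and the optional date-range filter is omitted) of how the keep_one statement is assembled:

    def build_keep_one_delete_sql(database, table, temp_table, use_columns,
                                  pk_real='id', batch_size=10000):
        # Mirrors the f-string in the hunk above: rows NULL-safely equal to a
        # duplicate group in the temp table, but not the group's min_id, are deleted.
        match = ' AND '.join(f't.`{col}` <=> tmp.`{col}`' for col in use_columns)
        return (
            f"DELETE FROM `{database}`.`{table}` t "
            f"WHERE EXISTS (SELECT 1 FROM `{database}`.`{temp_table}` tmp "
            f"WHERE {match} AND t.`{pk_real}` <> tmp.`min_id`) "
            f"LIMIT {batch_size}"
        )

    print(build_keep_one_delete_sql('my_db', 'orders', 'temp_orders_dedup',
                                    ['订单号', '日期']))
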
@@ -431,10 +446,7 @@ class MySQLDeduplicator:
  conn.commit()
  if batch_deleted < self.batch_size:
  break
- logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
- # 新增:去重后重排id
- if reset_id and affected_rows > 0:
- self._reset_id_column(database, table)
+ logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns, "去重模式": self.duplicate_keep_mode})
  else:
  logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
  affected_rows = 0
@@ -463,8 +475,7 @@ class MySQLDeduplicator:
  database: str,
  table: str,
  columns: Optional[List[str]] = None,
- dry_run: bool = False,
- reset_id: bool = False
+ dry_run: bool = False
  ) -> Tuple[int, int]:
  """
  对指定表进行去重。
@@ -474,7 +485,6 @@ class MySQLDeduplicator:
  table (str): 表名。
  columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
  dry_run (bool): 是否为模拟运行(只统计不实际删除)。
- reset_id (bool): 是否在去重后重排id。
  Returns:
  Tuple[int, int]: (重复组数, 实际删除行数)。
  """
@@ -486,7 +496,7 @@ class MySQLDeduplicator:
  logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
  return (0, 0)
  logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
- result = self._deduplicate_table(database, table, columns, dry_run, reset_id)
+ result = self._deduplicate_table(database, table, columns, dry_run)
  logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
  return result
  except Exception as e:
@@ -499,8 +509,7 @@ class MySQLDeduplicator:
  tables: Optional[List[str]] = None,
  columns_map: Optional[Dict[str, List[str]]] = None,
  dry_run: bool = False,
- parallel: bool = False,
- reset_id: bool = False
+ parallel: bool = False
  ) -> Dict[str, Tuple[int, int]]:
  """
  对指定数据库的所有表进行去重。
@@ -511,7 +520,6 @@ class MySQLDeduplicator:
  columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
  dry_run (bool): 是否为模拟运行。
  parallel (bool): 是否并行处理。
- reset_id (bool): 是否在去重后重排id。
  Returns:
  Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}。
  """
@@ -540,7 +548,7 @@ class MySQLDeduplicator:
  logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
  futures[executor.submit(
  self.deduplicate_table,
- database, table, columns, dry_run, reset_id
+ database, table, columns, dry_run
  )] = table
  for future in concurrent.futures.as_completed(futures):
  table = futures[future]
@@ -556,7 +564,7 @@ class MySQLDeduplicator:
  for table in target_tables:
  columns = columns_map.get(table) if columns_map else None
  dup_count, affected_rows = self.deduplicate_table(
- database, table, columns, dry_run, reset_id
+ database, table, columns, dry_run
  )
  results[table] = (dup_count, affected_rows)
  total_dup = sum(r[0] for r in results.values())
@@ -573,8 +581,7 @@ class MySQLDeduplicator:
  tables_map: Optional[Dict[str, List[str]]] = None,
  columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
  dry_run: bool = False,
- parallel: bool = False,
- reset_id: bool = False
+ parallel: bool = False
  ) -> Dict[str, Dict[str, Tuple[int, int]]]:
  """
  对所有数据库进行去重。
@@ -585,7 +592,6 @@ class MySQLDeduplicator:
  columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
  dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
  parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
- reset_id (bool): 是否在去重后重排id。
  Returns:
  Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}。
  """
@@ -609,7 +615,7 @@ class MySQLDeduplicator:
  db_columns_map = columns_map.get(db) if columns_map else None
  futures[executor.submit(
  self.deduplicate_database,
- db, tables, db_columns_map, dry_run, False, reset_id
+ db, tables, db_columns_map, dry_run, False
  )] = db
  for future in concurrent.futures.as_completed(futures):
  db = futures[future]
@@ -625,7 +631,7 @@ class MySQLDeduplicator:
  tables = tables_map.get(db) if tables_map else None
  db_columns_map = columns_map.get(db) if columns_map else None
  db_results = self.deduplicate_database(
- db, tables, db_columns_map, dry_run, parallel, reset_id
+ db, tables, db_columns_map, dry_run, parallel
  )
  all_results[db] = db_results
  total_dup = sum(
@@ -720,145 +726,206 @@ class MySQLDeduplicator:
  """
  self.close()
 
- def _has_foreign_key_dependency(self, database: str, table: str, pk: str) -> bool:
- """检测id列是否被其他表外键引用。"""
- fk_check_sql = '''
- SELECT TABLE_NAME, COLUMN_NAME, CONSTRAINT_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME
- FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
- WHERE REFERENCED_TABLE_SCHEMA = %s AND REFERENCED_TABLE_NAME = %s AND REFERENCED_COLUMN_NAME = %s
- '''
- with self._get_connection() as conn:
- with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(fk_check_sql, (database, table, pk))
- fk_rows = cursor.fetchall()
- return bool(fk_rows)
-
- def _get_table_create_sql_and_pk(self, database: str, table: str) -> tuple:
- """获取表的CREATE语句和主键字段列表。"""
- with self._get_connection() as conn:
- with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(f"SHOW CREATE TABLE `{database}`.`{table}`")
- create_sql = cursor.fetchone()['Create Table']
- cursor.execute(f"SHOW KEYS FROM `{database}`.`{table}` WHERE Key_name = 'PRIMARY'")
- pk_rows = cursor.fetchall()
- pk_columns = [row['Column_name'] for row in pk_rows]
- return create_sql, pk_columns
-
- def _make_temp_table_sql(self, create_sql: str, table: str, temp_table: str, pk: str, pk_columns: list) -> str:
- """生成临时表的CREATE语句,仅替换id字段类型。"""
- def replace_id_type(sql):
- lines = sql.split('\n')
- new_lines = []
- for line in lines:
- if re.match(rf'\s*`{pk}` ', line):
- if pk_columns == [pk]:
- line = re.sub(r'`' + pk + r'`\s+[^,]*', f'`{pk}` INT NOT NULL AUTO_INCREMENT', line)
- else:
- line = re.sub(r'`' + pk + r'`\s+[^,]*', f'`{pk}` INT NOT NULL', line)
- new_lines.append(line)
- return '\n'.join(new_lines)
- create_sql_temp = re.sub(
- rf'CREATE TABLE `{table}`',
- f'CREATE TABLE `{temp_table}`',
- create_sql,
- count=1
- )
- create_sql_temp = replace_id_type(create_sql_temp)
- create_sql_temp = re.sub(r'AUTO_INCREMENT=\d+', '', create_sql_temp)
- return create_sql_temp
-
- def _create_and_fill_temp_table(self, database: str, table: str, temp_table: str, pk: str) -> list:
- """创建临时表并插入重排id数据,返回所有字段名。"""
- with self._get_connection() as conn:
- with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(f"SHOW COLUMNS FROM `{database}`.`{table}`")
- columns = [row['Field'] for row in cursor.fetchall()]
- columns_wo_id = [col for col in columns if col != pk]
- col_list = ', '.join([f'`{col}`' for col in columns_wo_id])
- insert_sql = f"INSERT INTO `{database}`.`{temp_table}` ({col_list}, `{pk}`) SELECT {col_list}, (@rownum:=@rownum+1) as `{pk}` FROM `{database}`.`{table}` JOIN (SELECT @rownum:=0) r ORDER BY `{pk}` ASC"
- cursor.execute(insert_sql)
- return columns
-
- def _swap_tables_with_backup(self, database: str, table: str, temp_table: str, bak_table: str):
- """原表重命名为备份,临时表变原表名。"""
- with self._get_connection() as conn:
- with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(f"RENAME TABLE `{database}`.`{table}` TO `{database}`.`{bak_table}`")
- cursor.execute(f"RENAME TABLE `{database}`.`{temp_table}` TO `{database}`.`{table}`")
- conn.commit()
-
- def _check_and_cleanup_backup(self, database: str, table: str, bak_table: str) -> bool:
- """校验新表和备份表数据量一致,安全删除备份表。"""
- with self._get_connection() as conn:
- with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
- new_cnt = cursor.fetchone()['cnt']
- cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{bak_table}`")
- old_cnt = cursor.fetchone()['cnt']
- if new_cnt == old_cnt:
- cursor.execute(f"DROP TABLE `{database}`.`{bak_table}`")
- conn.commit()
- return True
- else:
- logger.error('id重排后数据量不一致,未删除备份表', {'库': database, '表': table, '新表行数': new_cnt, '备份表行数': old_cnt})
- return False
-
- def _rollback_table_swap(self, database: str, table: str, bak_table: str):
- """回滚:如bak表存在且原表不存在,则恢复原表名。"""
+ def reorder_id_column(
+ self,
+ database: str,
+ table: Optional[str] = None,
+ id_column: str = "id",
+ dry_run: bool = False,
+ auto_drop_backup: bool = True
+ ) -> Any:
+ """
+ 安全重排指定表或指定库下所有表的id列为顺序自增(1,2,3...)。
+ Args:
+ database (str): 数据库名
+ table (Optional[str]): 表名,None时批量处理该库所有表
+ id_column (str): id列名,默认"id"
+ dry_run (bool): 是否为模拟运行
+ auto_drop_backup (bool): 校验通过后自动删除备份表
+ Returns:
+ bool dict: 单表时bool,批量时{表名: bool}
+ """
+ if not table:
+ # 批量模式,对库下所有表执行
+ try:
+ all_tables = self._get_tables(database)
+ except Exception as e:
+ logger.error('获取库下所有表失败', {"库": database, "异常": str(e)})
+ return {}
+ results = {}
+ for tbl in all_tables:
+ try:
+ res = self.reorder_id_column(database, tbl, id_column, dry_run, auto_drop_backup)
+ results[tbl] = res
+ except Exception as e:
+ logger.error('批量id重排异常', {"库": database, "表": tbl, "异常": str(e)})
+ results[tbl] = False
+ logger.info('批量id重排完成', {"库": database, "结果": results})
+ return results
+ # 单表模式
+ table_quoted = f"`{database}`.`{table}`"
+ if not self._acquire_table_lock(database, table):
+ logger.warning('表级锁获取失败,跳过id重排', {"库": database, "表": table})
+ return False
  try:
+ # 检查表是否存在
+ if not self._check_table_exists(database, table):
+ logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
+ return False
+ # 检查id列是否存在
  with self._get_connection() as conn:
  with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(f"SHOW TABLES LIKE '{bak_table}'")
- if cursor.fetchone():
- cursor.execute(f"SHOW TABLES LIKE '{table}'")
- if not cursor.fetchone():
- cursor.execute(f"RENAME TABLE `{database}`.`{bak_table}` TO `{database}`.`{table}`")
- conn.commit()
- logger.info('回滚成功,已恢复原表', {'': database, '': table})
- except Exception as e2:
- logger.error('回滚失败', {'': database, '': table, '异常': str(e2)})
-
- def _reset_id_column(self, database: str, table: str) -> bool:
- pk = self.primary_key
- temp_table = f"temp_{table}_resetid_{os.getpid()}_{threading.get_ident()}"
- temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
- bak_table = f"{table}_bak_{int(time.time())}"
- try:
- # 1. 检查外键依赖
- if self._has_foreign_key_dependency(database, table, pk):
- logger.warning('存在外键依赖,拒绝重排id', {'库': database, '表': table})
+ cursor.execute("""
+ SELECT COLUMN_NAME, COLUMN_KEY
+ FROM INFORMATION_SCHEMA.COLUMNS
+ WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+ """, (database, table))
+ columns_info = cursor.fetchall()
+ columns = [row['COLUMN_NAME'] for row in columns_info]
+ id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+ if id_column not in columns:
+ logger.warning('表无id列,跳过id重排', {"": database, "": table})
+ return False
+ # 检查主键是否为单列id
+ pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+ if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
+ logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
  return False
- # 2. 获取表结构和主键
- create_sql, pk_columns = self._get_table_create_sql_and_pk(database, table)
- # 3. 生成临时表DDL
- create_sql_temp = self._make_temp_table_sql(create_sql, table, temp_table, pk, pk_columns)
- # 4. 创建临时表
+ # 检查外键约束
  with self._get_connection() as conn:
  with conn.cursor() as cursor:
- cursor.execute(f"USE `{database}`")
- cursor.execute(f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`")
- cursor.execute(create_sql_temp)
- conn.commit()
- # 5. 填充临时表
- self._create_and_fill_temp_table(database, table, temp_table, pk)
- # 6. 表交换
- self._swap_tables_with_backup(database, table, temp_table, bak_table)
- # 7. 校验和清理
- if self._check_and_cleanup_backup(database, table, bak_table):
- logger.info('id重排完成并安全删除备份表,主键信息已保留', {'库': database, '表': table})
+ cursor.execute("""
+ SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
+ WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s AND REFERENCED_TABLE_NAME IS NOT NULL
+ """, (database, table))
+ if cursor.fetchone():
+ logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
+ return False
+ # 获取表结构
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
+ create_table_sql = cursor.fetchone()['Create Table']
+ logger.info('开始id重排', {"库": database, "表": table, "重排列": id_column, "dry_run": dry_run, "DDL警告": "MySQL DDL操作不可回滚,建议提前备份!"})
+ if dry_run:
+ logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
  return True
- else:
+ temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_reorderid_{os.getpid()}_{threading.get_ident()}")
+ temp_table_quoted = f"`{database}`.`{temp_table}`"
+ backup_table = self._make_safe_table_name(table, prefix="backup_", suffix=f"_{int(time.time())}_{uuid.uuid4().hex[:8]}")
+ backup_table_quoted = f"`{database}`.`{backup_table}`"
+ try:
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ # 1. 创建临时表,结构同原表
+ try:
+ cursor.execute(f"CREATE TABLE {temp_table_quoted} LIKE {table_quoted}")
+ except Exception as e:
+ logger.error('创建临时表失败', {"库": database, "表": table, "异常": str(e)})
+ return False
+ # 2. 插入数据,id列用ROW_NUMBER重排(MySQL 8+)
+ all_cols = ','.join([f'`{col}`' for col in columns])
+ all_cols_noid = ','.join([f'`{col}`' for col in columns if col != id_column])
+ insert_sql = f"""
+ INSERT INTO {temp_table_quoted} ({all_cols})
+ SELECT ROW_NUMBER() OVER (ORDER BY `{id_column}` ASC) as `{id_column}`, {all_cols_noid}
+ FROM {table_quoted}
+ """
+ try:
+ cursor.execute(insert_sql)
+ except Exception as e:
+ logger.error('插入重排数据失败', {"库": database, "表": table, "异常": str(e)})
+ try:
+ cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
+ except Exception as drop_e:
+ logger.error('插入失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
+ return False
+ # 如果id不是主键,尝试加主键(如不冲突)
+ if not id_is_pk:
+ try:
+ cursor.execute(f"ALTER TABLE {temp_table_quoted} ADD PRIMARY KEY(`{id_column}`)")
+ except Exception as e:
+ logger.warning('id列加主键失败,可能已存在其他主键', {"库": database, "表": table, "异常": str(e)})
+ # 3. 原表重命名为备份,临时表重命名为正式表
+ try:
+ cursor.execute(f"RENAME TABLE {table_quoted} TO {backup_table_quoted}, {temp_table_quoted} TO {table_quoted}")
+ except Exception as e:
+ logger.error('RENAME TABLE失败', {"库": database, "表": table, "异常": str(e)})
+ # 回滚:删除临时表
+ try:
+ cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
+ except Exception as drop_e:
+ logger.error('RENAME失败后删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
+ return False
+ # 4. 校验新表和备份表数据量一致
+ try:
+ cursor.execute(f"SELECT COUNT(*) as cnt FROM {table_quoted}")
+ new_cnt = cursor.fetchone()['cnt']
+ cursor.execute(f"SELECT COUNT(*) as cnt FROM {backup_table_quoted}")
+ old_cnt = cursor.fetchone()['cnt']
+ except Exception as e:
+ logger.error('校验数据量失败', {"库": database, "表": table, "异常": str(e)})
+ return False
+ if new_cnt != old_cnt:
+ logger.error('id重排后数据量不一致,自动回滚', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt})
+ # 回滚:恢复原表
+ try:
+ cursor.execute(f"DROP TABLE {table_quoted}")
+ cursor.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
+ except Exception as e:
+ logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
+ return False
+ logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
+ # 5. 可选:自动删除备份表
+ if auto_drop_backup:
+ try:
+ cursor.execute(f"DROP TABLE {backup_table_quoted}")
+ logger.info('已自动删除备份表', {"库": database, "表": table, "备份表名": backup_table})
+ except Exception as e:
+ logger.error('自动删除备份表失败', {"库": database, "表": table, "异常": str(e)})
+ return True
+ except Exception as e:
+ logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
+ # 回滚:如临时表存在则删掉,恢复原表结构
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ try:
+ cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
+ except Exception as drop_e:
+ logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
+ # 恢复原表(如备份表存在)
+ try:
+ with self._get_connection() as conn2:
+ with conn2.cursor() as cursor2:
+ if self._check_table_exists(database, backup_table):
+ cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
+ cursor2.execute(f"RENAME TABLE {backup_table_quoted} TO {table_quoted}")
+ logger.info('已自动恢复原表', {"库": database, "表": table, "备份表名": backup_table})
+ except Exception as recover_e:
+ logger.error('回滚时恢复原表失败', {"库": database, "表": table, "异常": str(recover_e)})
  return False
- except Exception as e:
- logger.error('id重排失败,尝试回滚', {'库': database, '表': table, '异常': str(e)})
- self._rollback_table_swap(database, table, bak_table)
- return False
+ finally:
+ self._release_table_lock(database, table)
+
+ @staticmethod
+ def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
+ """
+ 生成安全的MySQL表名,确保总长度不超过max_length字节。
+ :param base: 原始表名
+ :param prefix: 前缀
+ :param suffix: 后缀
+ :param max_length: 最大长度,默认64
+ :return: 安全表名
+ """
+ # 只允许字母数字下划线
+ base = re.sub(r'[^a-zA-Z0-9_]', '_', base)
+ prefix = re.sub(r'[^a-zA-Z0-9_]', '_', prefix)
+ suffix = re.sub(r'[^a-zA-Z0-9_]', '_', suffix)
+ remain = max_length - len(prefix) - len(suffix)
+ if remain < 1:
+ # 前后缀太长,直接截断
+ return (prefix + suffix)[:max_length]
+ return f"{prefix}{base[:remain]}{suffix}"[:max_length]
 
 
  def main():
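
The new reorder_id_column implementation copies the rows into a CREATE TABLE ... LIKE temporary table while renumbering the id column with ROW_NUMBER() (so it requires MySQL 8.0+), then swaps old and new tables with a single RENAME TABLE and keeps a backup until the row counts match. A standalone sketch (illustration only; database, table, and column names are made up) of the INSERT it builds:

    def build_reorder_insert_sql(database, table, temp_table, columns, id_column='id'):
        # Mirrors the insert_sql f-string above: copy every column into the temp
        # table, replacing the id value with a fresh 1..N sequence ordered by the
        # old id. ROW_NUMBER() needs MySQL 8.0 or newer.
        all_cols = ','.join(f'`{col}`' for col in columns)
        other_cols = ','.join(f'`{col}`' for col in columns if col != id_column)
        return (
            f"INSERT INTO `{database}`.`{temp_table}` ({all_cols}) "
            f"SELECT ROW_NUMBER() OVER (ORDER BY `{id_column}` ASC) AS `{id_column}`, {other_cols} "
            f"FROM `{database}`.`{table}`"
        )

    print(build_reorder_insert_sql('my_db', 'orders', 'temp_orders_reorderid',
                                   ['id', '日期', '订单号']))
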
@@ -870,14 +937,16 @@ def main():
  )
 
  # 全库去重(单线程)
- deduplicator.deduplicate_all(dry_run=False, parallel=False, reset_id=False)
+ deduplicator.deduplicate_all(dry_run=False, parallel=True)
 
  # # 指定数据库去重(多线程)
- # logger.info('调用deduplicate_database')
- # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reset_id=False)
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False)
 
  # # 指定表去重(使用特定列)
- # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False, reset_id=False)
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
+
+ # # 重排id列
+ # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
 
  # 关闭连接
  deduplicator.close()
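
When reorder_id_column is called without a table name it processes every table in the database and returns a dict of {table: bool}. A small helper (not part of the package, shown as a sketch) for summarizing that result:

    from typing import Dict

    def summarize_reorder_results(results: Dict[str, bool]) -> str:
        # `results` is the dict returned by reorder_id_column in batch mode.
        ok = [t for t, success in results.items() if success]
        skipped = [t for t, success in results.items() if not success]
        return f"reordered {len(ok)} table(s); skipped/failed: {skipped}"

    print(summarize_reorder_results({'orders': True, 'orders_bak': False}))
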
mdbq-3.11.6.dist-info/METADATA → mdbq-3.11.8.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mdbq
- Version: 3.11.6
+ Version: 3.11.8
  Home-page: https://pypi.org/project/mdbq
  Author: xigua,
  Author-email: 2587125111@qq.com
mdbq-3.11.6.dist-info/RECORD → mdbq-3.11.8.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
- mdbq/__version__.py,sha256=jUdj4-uaa03JUoNnXK_fTx_XQfDwjeFprE71R3ZenRY,18
+ mdbq/__version__.py,sha256=JqV56ilza72jpkf_fztVtAdeSmcdPr0BmGGo9FFjGrA,18
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
  mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
- mdbq/mysql/deduplicator.py,sha256=XSAgt6HqvzDyZSiv4mHli5fA3p3ePn5g3HupqI2cyVo,41444
+ mdbq/mysql/deduplicator.py,sha256=Znmjn4sI1Mj2koSPTDojFwg_1MTgk3GZTFZyhSRwn7s,46746
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
  mdbq/mysql/uploader.py,sha256=LxPlAfSNhQbLu-or4wxa-vLjCw5_PIN3ZVoksWUJazQ,61701
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
- mdbq-3.11.6.dist-info/METADATA,sha256=gEcCBqGoPLhugYleGDv1r2YV_lHPL3AnGHD_dHTxY3Q,364
- mdbq-3.11.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mdbq-3.11.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
- mdbq-3.11.6.dist-info/RECORD,,
+ mdbq-3.11.8.dist-info/METADATA,sha256=EJtaHsIzWmcB9hTRg1NZeDd55Zez0lu6FPD_ZQB9nMw,364
+ mdbq-3.11.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mdbq-3.11.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+ mdbq-3.11.8.dist-info/RECORD,,