mdbq 3.11.5__py3-none-any.whl → 3.11.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.11.5'
1
+ VERSION = '3.11.6'
@@ -329,7 +329,8 @@ class MySQLDeduplicator:
329
329
  database: str,
330
330
  table: str,
331
331
  columns: Optional[List[str]] = None,
332
- dry_run: bool = False
332
+ dry_run: bool = False,
333
+ reset_id: bool = False
333
334
  ) -> Tuple[int, int]:
334
335
  """
335
336
  执行单表去重。
@@ -339,6 +340,7 @@ class MySQLDeduplicator:
339
340
  table (str): 表名。
340
341
  columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
341
342
  dry_run (bool): 是否为模拟运行(只统计不实际删除)。
343
+ reset_id (bool): 是否在去重后重排id。
342
344
  Returns:
343
345
  Tuple[int, int]: (重复组数, 实际删除行数)。
344
346
  """
@@ -430,6 +432,9 @@ class MySQLDeduplicator:
430
432
  if batch_deleted < self.batch_size:
431
433
  break
432
434
  logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
435
+ # 新增:去重后重排id
436
+ if reset_id and affected_rows > 0:
437
+ self._reset_id_column(database, table)
433
438
  else:
434
439
  logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
435
440
  affected_rows = 0
@@ -458,7 +463,8 @@ class MySQLDeduplicator:
458
463
  database: str,
459
464
  table: str,
460
465
  columns: Optional[List[str]] = None,
461
- dry_run: bool = False
466
+ dry_run: bool = False,
467
+ reset_id: bool = False
462
468
  ) -> Tuple[int, int]:
463
469
  """
464
470
  对指定表进行去重。
@@ -468,6 +474,7 @@ class MySQLDeduplicator:
468
474
  table (str): 表名。
469
475
  columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
470
476
  dry_run (bool): 是否为模拟运行(只统计不实际删除)。
477
+ reset_id (bool): 是否在去重后重排id。
471
478
  Returns:
472
479
  Tuple[int, int]: (重复组数, 实际删除行数)。
473
480
  """
@@ -479,7 +486,7 @@ class MySQLDeduplicator:
479
486
  logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
480
487
  return (0, 0)
481
488
  logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
482
- result = self._deduplicate_table(database, table, columns, dry_run)
489
+ result = self._deduplicate_table(database, table, columns, dry_run, reset_id)
483
490
  logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
484
491
  return result
485
492
  except Exception as e:
@@ -492,7 +499,8 @@ class MySQLDeduplicator:
492
499
  tables: Optional[List[str]] = None,
493
500
  columns_map: Optional[Dict[str, List[str]]] = None,
494
501
  dry_run: bool = False,
495
- parallel: bool = False
502
+ parallel: bool = False,
503
+ reset_id: bool = False
496
504
  ) -> Dict[str, Tuple[int, int]]:
497
505
  """
498
506
  对指定数据库的所有表进行去重。
@@ -503,6 +511,7 @@ class MySQLDeduplicator:
503
511
  columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
504
512
  dry_run (bool): 是否为模拟运行。
505
513
  parallel (bool): 是否并行处理。
514
+ reset_id (bool): 是否在去重后重排id。
506
515
  Returns:
507
516
  Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}。
508
517
  """
@@ -531,7 +540,7 @@ class MySQLDeduplicator:
531
540
  logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
532
541
  futures[executor.submit(
533
542
  self.deduplicate_table,
534
- database, table, columns, dry_run
543
+ database, table, columns, dry_run, reset_id
535
544
  )] = table
536
545
  for future in concurrent.futures.as_completed(futures):
537
546
  table = futures[future]
@@ -547,7 +556,7 @@ class MySQLDeduplicator:
547
556
  for table in target_tables:
548
557
  columns = columns_map.get(table) if columns_map else None
549
558
  dup_count, affected_rows = self.deduplicate_table(
550
- database, table, columns, dry_run
559
+ database, table, columns, dry_run, reset_id
551
560
  )
552
561
  results[table] = (dup_count, affected_rows)
553
562
  total_dup = sum(r[0] for r in results.values())
@@ -564,7 +573,8 @@ class MySQLDeduplicator:
564
573
  tables_map: Optional[Dict[str, List[str]]] = None,
565
574
  columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
566
575
  dry_run: bool = False,
567
- parallel: bool = False
576
+ parallel: bool = False,
577
+ reset_id: bool = False
568
578
  ) -> Dict[str, Dict[str, Tuple[int, int]]]:
569
579
  """
570
580
  对所有数据库进行去重。
@@ -575,6 +585,7 @@ class MySQLDeduplicator:
575
585
  columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
576
586
  dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
577
587
  parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
588
+ reset_id (bool): 是否在去重后重排id。
578
589
  Returns:
579
590
  Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}。
580
591
  """
@@ -598,7 +609,7 @@ class MySQLDeduplicator:
598
609
  db_columns_map = columns_map.get(db) if columns_map else None
599
610
  futures[executor.submit(
600
611
  self.deduplicate_database,
601
- db, tables, db_columns_map, dry_run, False
612
+ db, tables, db_columns_map, dry_run, False, reset_id
602
613
  )] = db
603
614
  for future in concurrent.futures.as_completed(futures):
604
615
  db = futures[future]
@@ -614,7 +625,7 @@ class MySQLDeduplicator:
614
625
  tables = tables_map.get(db) if tables_map else None
615
626
  db_columns_map = columns_map.get(db) if columns_map else None
616
627
  db_results = self.deduplicate_database(
617
- db, tables, db_columns_map, dry_run, parallel
628
+ db, tables, db_columns_map, dry_run, parallel, reset_id
618
629
  )
619
630
  all_results[db] = db_results
620
631
  total_dup = sum(
@@ -709,6 +720,146 @@ class MySQLDeduplicator:
709
720
  """
710
721
  self.close()
711
722
 
723
def _has_foreign_key_dependency(self, database: str, table: str, pk: str) -> bool:
    """Report whether any foreign key references ``database.table(pk)``.

    Args:
        database (str): Schema that owns the referenced table.
        table (str): Name of the referenced table.
        pk (str): Name of the referenced column.

    Returns:
        bool: True when INFORMATION_SCHEMA.KEY_COLUMN_USAGE lists at least
        one constraint that references the column, otherwise False.
    """
    # Only existence matters, so select a constant and stop at the first
    # match instead of transferring every referencing row.
    fk_check_sql = '''
        SELECT 1
        FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
        WHERE REFERENCED_TABLE_SCHEMA = %s AND REFERENCED_TABLE_NAME = %s AND REFERENCED_COLUMN_NAME = %s
        LIMIT 1
    '''
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            cursor.execute(fk_check_sql, (database, table, pk))
            return bool(cursor.fetchone())
736
+
737
def _get_table_create_sql_and_pk(self, database: str, table: str) -> Tuple[str, List[str]]:
    """Fetch a table's CREATE statement and the columns of its primary key.

    Args:
        database (str): Schema that owns the table.
        table (str): Table name.

    Returns:
        Tuple[str, List[str]]: The ``Create Table`` text reported by
        ``SHOW CREATE TABLE`` and the ``Column_name`` values of the rows
        ``SHOW KEYS`` reports for the ``PRIMARY`` key (empty when there is
        no primary key).

    Raises:
        ValueError: If ``SHOW CREATE TABLE`` yields no row or the row has no
            ``Create Table`` entry (for example when the object is a view).
    """
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            cursor.execute(f"SHOW CREATE TABLE `{database}`.`{table}`")
            create_row = cursor.fetchone()
            # Fail with a readable message instead of an opaque
            # TypeError/KeyError when the statement is unavailable.
            if not create_row or 'Create Table' not in create_row:
                raise ValueError(
                    f"SHOW CREATE TABLE returned no 'Create Table' entry for `{database}`.`{table}`"
                )
            create_sql = create_row['Create Table']
            cursor.execute(f"SHOW KEYS FROM `{database}`.`{table}` WHERE Key_name = 'PRIMARY'")
            pk_columns = [row['Column_name'] for row in cursor.fetchall()]
    return create_sql, pk_columns
748
+
749
def _make_temp_table_sql(self, create_sql: str, table: str, temp_table: str, pk: str, pk_columns: list) -> str:
    """Derive the temp table's CREATE statement from the original one.

    The table name in the leading ``CREATE TABLE`` clause is swapped for
    ``temp_table``, the definition line of the ``pk`` column is rewritten,
    and any ``AUTO_INCREMENT=<n>`` table option is removed so numbering
    starts fresh.

    Args:
        create_sql (str): Statement text as returned by ``SHOW CREATE TABLE``
            (one column definition per line).
        table (str): Original table name appearing in ``create_sql``.
        temp_table (str): Name to use for the temp table.
        pk (str): Name of the id column whose definition is replaced.
        pk_columns (list): Columns of the table's primary key.

    Returns:
        str: The rewritten CREATE statement.
    """
    # The id column only becomes AUTO_INCREMENT when it is the sole
    # primary-key column.
    pk_definition = f'`{pk}` INT NOT NULL'
    if pk_columns == [pk]:
        pk_definition += ' AUTO_INCREMENT'

    # Names are escaped so regex metacharacters in identifiers match
    # literally; a callable replacement keeps backslashes in the new name
    # from being read as group references.
    create_sql_temp = re.sub(
        rf'CREATE TABLE `{re.escape(table)}`',
        lambda _match: f'CREATE TABLE `{temp_table}`',
        create_sql,
        count=1,
    )

    pk_line = re.compile(rf'(\s*)`{re.escape(pk)}` ')
    new_lines = []
    for line in create_sql_temp.split('\n'):
        matched = pk_line.match(line)
        if matched:
            # Replace the whole definition line (keeping indent and the
            # separating comma) so commas inside a COMMENT or DEFAULT
            # cannot leave a truncated definition behind.
            trailing = ',' if line.rstrip().endswith(',') else ''
            line = f'{matched.group(1)}{pk_definition}{trailing}'
        new_lines.append(line)

    return re.sub(r'AUTO_INCREMENT=\d+', '', '\n'.join(new_lines))
771
+
772
def _create_and_fill_temp_table(self, database: str, table: str, temp_table: str, pk: str) -> list:
    """Copy every row into the temp table with freshly numbered ids.

    Despite the name, the temp table itself is not created here; this
    method only reads the source table's column list and runs one
    ``INSERT ... SELECT`` that numbers rows 1, 2, 3, ... via a user variable.

    Args:
        database (str): Schema that owns both tables.
        table (str): Source table.
        temp_table (str): Destination table (must already exist).
        pk (str): Name of the id column to renumber.

    Returns:
        list: All column names of the source table, in ``SHOW COLUMNS`` order.
    """
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            cursor.execute(f"SHOW COLUMNS FROM `{database}`.`{table}`")
            columns = [row['Field'] for row in cursor.fetchall()]
            col_list = ', '.join(f'`{col}`' for col in columns if col != pk)
            # When the id is the table's only column there is nothing to
            # copy besides the counter, so no leading "cols, " is emitted.
            copied = f"{col_list}, " if col_list else ""
            # NOTE(review): MySQL documents the evaluation order of
            # expressions that assign user variables as undefined, so the
            # new ids are not guaranteed to follow the ORDER BY -- confirm
            # on the targeted server versions.
            insert_sql = (
                f"INSERT INTO `{database}`.`{temp_table}` ({copied}`{pk}`) "
                f"SELECT {copied}(@rownum:=@rownum+1) as `{pk}` "
                f"FROM `{database}`.`{table}` JOIN (SELECT @rownum:=0) r ORDER BY `{pk}` ASC"
            )
            cursor.execute(insert_sql)
        # Commit explicitly, as the other write helpers of this class do,
        # so the copied rows survive on connections without autocommit.
        conn.commit()
    return columns
784
+
785
def _swap_tables_with_backup(self, database: str, table: str, temp_table: str, bak_table: str):
    """Move the original table aside as a backup and promote the temp table.

    After this call ``bak_table`` holds the original data and ``table``
    holds what used to be ``temp_table``.

    Args:
        database (str): Schema that owns the tables.
        table (str): Live table name.
        temp_table (str): Table that takes over the live name.
        bak_table (str): Name given to the original table.
    """
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            # A single multi-table RENAME is applied as one atomic
            # operation, so there is no moment at which `table` is missing
            # and no half-finished swap if the second rename cannot run.
            cursor.execute(
                f"RENAME TABLE `{database}`.`{table}` TO `{database}`.`{bak_table}`, "
                f"`{database}`.`{temp_table}` TO `{database}`.`{table}`"
            )
        conn.commit()
793
+
794
def _check_and_cleanup_backup(self, database: str, table: str, bak_table: str) -> bool:
    """Drop the backup table once its row count matches the new table's.

    Args:
        database (str): Schema that owns both tables.
        table (str): The table now carrying the live name.
        bak_table (str): The backup holding the pre-swap data.

    Returns:
        bool: True when the counts were equal and the backup was dropped;
        False when they differ (an error is logged and the backup is kept).
    """
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            # Count the new table first, then the backup.
            row_counts = []
            for name in (table, bak_table):
                cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{name}`")
                row_counts.append(cursor.fetchone()['cnt'])
            new_cnt, old_cnt = row_counts
            if new_cnt != old_cnt:
                logger.error('id重排后数据量不一致,未删除备份表', {'库': database, '表': table, '新表行数': new_cnt, '备份表行数': old_cnt})
                return False
            cursor.execute(f"DROP TABLE `{database}`.`{bak_table}`")
            conn.commit()
            return True
810
+
811
def _rollback_table_swap(self, database: str, table: str, bak_table: str):
    """Restore the original table name after a failed swap (best effort).

    The backup is renamed back to ``table`` only when the backup exists and
    no table currently carries the original name. Any exception raised
    while doing so is logged and swallowed so the caller's own failure
    handling is not interrupted.

    Args:
        database (str): Schema that owns the tables.
        table (str): Original table name to restore.
        bak_table (str): Backup table created by the swap.
    """
    # Exact, parameterized lookup: `SHOW TABLES LIKE` treats `_` and `%` in
    # a name as wildcards, so a similarly named table could be mistaken for
    # the one being checked.
    exists_sql = (
        "SELECT 1 FROM INFORMATION_SCHEMA.TABLES "
        "WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s LIMIT 1"
    )
    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(f"USE `{database}`")
                cursor.execute(exists_sql, (database, bak_table))
                if not cursor.fetchone():
                    return  # no backup: the swap never started
                cursor.execute(exists_sql, (database, table))
                if cursor.fetchone():
                    return  # the original name is taken: nothing to restore
                cursor.execute(f"RENAME TABLE `{database}`.`{bak_table}` TO `{database}`.`{table}`")
                conn.commit()
                logger.info('回滚成功,已恢复原表', {'库': database, '表': table})
    except Exception as e2:
        logger.error('回滚失败', {'库': database, '表': table, '异常': str(e2)})
826
+
827
def _reset_id_column(self, database: str, table: str) -> bool:
    """Renumber the id column of a table by rebuilding it through a temp table.

    Steps: refuse when the id column is referenced by a foreign key, build a
    temp table from the original DDL, copy the rows with fresh ids, swap the
    temp table into place (keeping the original as a timestamped backup),
    then drop the backup once the row counts match.

    Args:
        database (str): Schema that owns the table.
        table (str): Table whose ``self.primary_key`` column is renumbered.

    Returns:
        bool: True when the ids were renumbered and the backup was removed;
        False when the operation was refused, the row-count check failed, or
        any step raised (in which case a rollback is attempted).
    """
    pk = self.primary_key
    # Process id and thread id keep concurrent workers from sharing a temp
    # table; the name is then sanitized and capped at 60 characters.
    temp_table = f"temp_{table}_resetid_{os.getpid()}_{threading.get_ident()}"
    temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
    bak_table = f"{table}_bak_{int(time.time())}"
    try:
        # 1. Refuse when other tables reference the id column.
        if self._has_foreign_key_dependency(database, table, pk):
            logger.warning('存在外键依赖,拒绝重排id', {'库': database, '表': table})
            return False
        # 2. Read the table definition and its primary-key columns.
        create_sql, pk_columns = self._get_table_create_sql_and_pk(database, table)
        # 3. Derive the temp table DDL.
        create_sql_temp = self._make_temp_table_sql(create_sql, table, temp_table, pk, pk_columns)
        # 4. Create the temp table, replacing any stale one of the same name.
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(f"USE `{database}`")
                cursor.execute(f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`")
                cursor.execute(create_sql_temp)
            conn.commit()
        # 5. Copy the rows with renumbered ids.
        self._create_and_fill_temp_table(database, table, temp_table, pk)
        # 6. Swap the temp table into place.
        self._swap_tables_with_backup(database, table, temp_table, bak_table)
        # 7. Verify row counts and drop the backup.
        if self._check_and_cleanup_backup(database, table, bak_table):
            logger.info('id重排完成并安全删除备份表,主键信息已保留', {'库': database, '表': table})
            return True
        return False
    except Exception as e:
        logger.error('id重排失败,尝试回滚', {'库': database, '表': table, '异常': str(e)})
        self._rollback_table_swap(database, table, bak_table)
        # Best-effort cleanup so a failed run does not leave its temp table
        # behind. This is a no-op when the temp table was never created or
        # has already been renamed into place.
        try:
            with self._get_connection() as conn:
                with conn.cursor() as cursor:
                    cursor.execute(f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`")
                conn.commit()
        except Exception as cleanup_error:
            logger.warning('临时表清理失败', {'库': database, '表': table, '异常': str(cleanup_error)})
        return False
862
+
712
863
 
713
864
  def main():
714
865
  deduplicator = MySQLDeduplicator(
@@ -719,18 +870,18 @@ def main():
719
870
  )
720
871
 
721
872
  # 全库去重(单线程)
722
- deduplicator.deduplicate_all(dry_run=False, parallel=False)
873
+ deduplicator.deduplicate_all(dry_run=False, parallel=False, reset_id=False)
723
874
 
724
875
  # # 指定数据库去重(多线程)
725
876
  # logger.info('调用deduplicate_database')
726
- # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
877
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reset_id=False)
727
878
 
728
879
  # # 指定表去重(使用特定列)
729
- # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
880
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False, reset_id=False)
730
881
 
731
882
  # 关闭连接
732
883
  deduplicator.close()
733
884
 
734
885
  if __name__ == '__main__':
735
- # main()
886
+ main()
736
887
  pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.11.5
3
+ Version: 3.11.6
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=Ne7M_nVkNZ4zJcEt-_2BhpBw01sgUpRFBru080buHJk,18
2
+ mdbq/__version__.py,sha256=jUdj4-uaa03JUoNnXK_fTx_XQfDwjeFprE71R3ZenRY,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
5
5
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
8
8
  mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/deduplicator.py,sha256=bIV010UkFfSUONY6-756x3tDVO4k6q3pqxoY3Z2xT-k,32990
11
+ mdbq/mysql/deduplicator.py,sha256=XSAgt6HqvzDyZSiv4mHli5fA3p3ePn5g3HupqI2cyVo,41444
12
12
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
13
13
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
14
14
  mdbq/mysql/uploader.py,sha256=LxPlAfSNhQbLu-or4wxa-vLjCw5_PIN3ZVoksWUJazQ,61701
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
24
24
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
25
25
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
26
26
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
27
- mdbq-3.11.5.dist-info/METADATA,sha256=OxaQPCqcPoi2t8FFpw2eCKnde3B3ci-665xalC4GZT0,364
28
- mdbq-3.11.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
- mdbq-3.11.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
- mdbq-3.11.5.dist-info/RECORD,,
27
+ mdbq-3.11.6.dist-info/METADATA,sha256=gEcCBqGoPLhugYleGDv1r2YV_lHPL3AnGHD_dHTxY3Q,364
28
+ mdbq-3.11.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
+ mdbq-3.11.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
+ mdbq-3.11.6.dist-info/RECORD,,
File without changes