mdbq 3.11.5__py3-none-any.whl → 3.11.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +164 -13
- {mdbq-3.11.5.dist-info → mdbq-3.11.6.dist-info}/METADATA +1 -1
- {mdbq-3.11.5.dist-info → mdbq-3.11.6.dist-info}/RECORD +6 -6
- {mdbq-3.11.5.dist-info → mdbq-3.11.6.dist-info}/WHEEL +0 -0
- {mdbq-3.11.5.dist-info → mdbq-3.11.6.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.5'
|
1
|
+
VERSION = '3.11.6'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -329,7 +329,8 @@ class MySQLDeduplicator:
|
|
329
329
|
database: str,
|
330
330
|
table: str,
|
331
331
|
columns: Optional[List[str]] = None,
|
332
|
-
dry_run: bool = False
|
332
|
+
dry_run: bool = False,
|
333
|
+
reset_id: bool = False
|
333
334
|
) -> Tuple[int, int]:
|
334
335
|
"""
|
335
336
|
执行单表去重。
|
@@ -339,6 +340,7 @@ class MySQLDeduplicator:
|
|
339
340
|
table (str): 表名。
|
340
341
|
columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
|
341
342
|
dry_run (bool): 是否为模拟运行(只统计不实际删除)。
|
343
|
+
reset_id (bool): 是否在去重后重排id。
|
342
344
|
Returns:
|
343
345
|
Tuple[int, int]: (重复组数, 实际删除行数)。
|
344
346
|
"""
|
@@ -430,6 +432,9 @@ class MySQLDeduplicator:
|
|
430
432
|
if batch_deleted < self.batch_size:
|
431
433
|
break
|
432
434
|
logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
|
435
|
+
# 新增:去重后重排id
|
436
|
+
if reset_id and affected_rows > 0:
|
437
|
+
self._reset_id_column(database, table)
|
433
438
|
else:
|
434
439
|
logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
|
435
440
|
affected_rows = 0
|
@@ -458,7 +463,8 @@ class MySQLDeduplicator:
|
|
458
463
|
database: str,
|
459
464
|
table: str,
|
460
465
|
columns: Optional[List[str]] = None,
|
461
|
-
dry_run: bool = False
|
466
|
+
dry_run: bool = False,
|
467
|
+
reset_id: bool = False
|
462
468
|
) -> Tuple[int, int]:
|
463
469
|
"""
|
464
470
|
对指定表进行去重。
|
@@ -468,6 +474,7 @@ class MySQLDeduplicator:
|
|
468
474
|
table (str): 表名。
|
469
475
|
columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
|
470
476
|
dry_run (bool): 是否为模拟运行(只统计不实际删除)。
|
477
|
+
reset_id (bool): 是否在去重后重排id。
|
471
478
|
Returns:
|
472
479
|
Tuple[int, int]: (重复组数, 实际删除行数)。
|
473
480
|
"""
|
@@ -479,7 +486,7 @@ class MySQLDeduplicator:
|
|
479
486
|
logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
|
480
487
|
return (0, 0)
|
481
488
|
logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
|
482
|
-
result = self._deduplicate_table(database, table, columns, dry_run)
|
489
|
+
result = self._deduplicate_table(database, table, columns, dry_run, reset_id)
|
483
490
|
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
|
484
491
|
return result
|
485
492
|
except Exception as e:
|
@@ -492,7 +499,8 @@ class MySQLDeduplicator:
|
|
492
499
|
tables: Optional[List[str]] = None,
|
493
500
|
columns_map: Optional[Dict[str, List[str]]] = None,
|
494
501
|
dry_run: bool = False,
|
495
|
-
parallel: bool = False
|
502
|
+
parallel: bool = False,
|
503
|
+
reset_id: bool = False
|
496
504
|
) -> Dict[str, Tuple[int, int]]:
|
497
505
|
"""
|
498
506
|
对指定数据库的所有表进行去重。
|
@@ -503,6 +511,7 @@ class MySQLDeduplicator:
|
|
503
511
|
columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
|
504
512
|
dry_run (bool): 是否为模拟运行。
|
505
513
|
parallel (bool): 是否并行处理。
|
514
|
+
reset_id (bool): 是否在去重后重排id。
|
506
515
|
Returns:
|
507
516
|
Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}。
|
508
517
|
"""
|
@@ -531,7 +540,7 @@ class MySQLDeduplicator:
|
|
531
540
|
logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
|
532
541
|
futures[executor.submit(
|
533
542
|
self.deduplicate_table,
|
534
|
-
database, table, columns, dry_run
|
543
|
+
database, table, columns, dry_run, reset_id
|
535
544
|
)] = table
|
536
545
|
for future in concurrent.futures.as_completed(futures):
|
537
546
|
table = futures[future]
|
@@ -547,7 +556,7 @@ class MySQLDeduplicator:
|
|
547
556
|
for table in target_tables:
|
548
557
|
columns = columns_map.get(table) if columns_map else None
|
549
558
|
dup_count, affected_rows = self.deduplicate_table(
|
550
|
-
database, table, columns, dry_run
|
559
|
+
database, table, columns, dry_run, reset_id
|
551
560
|
)
|
552
561
|
results[table] = (dup_count, affected_rows)
|
553
562
|
total_dup = sum(r[0] for r in results.values())
|
@@ -564,7 +573,8 @@ class MySQLDeduplicator:
|
|
564
573
|
tables_map: Optional[Dict[str, List[str]]] = None,
|
565
574
|
columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
|
566
575
|
dry_run: bool = False,
|
567
|
-
parallel: bool = False
|
576
|
+
parallel: bool = False,
|
577
|
+
reset_id: bool = False
|
568
578
|
) -> Dict[str, Dict[str, Tuple[int, int]]]:
|
569
579
|
"""
|
570
580
|
对所有数据库进行去重。
|
@@ -575,6 +585,7 @@ class MySQLDeduplicator:
|
|
575
585
|
columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
|
576
586
|
dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
|
577
587
|
parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
|
588
|
+
reset_id (bool): 是否在去重后重排id。
|
578
589
|
Returns:
|
579
590
|
Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}。
|
580
591
|
"""
|
@@ -598,7 +609,7 @@ class MySQLDeduplicator:
|
|
598
609
|
db_columns_map = columns_map.get(db) if columns_map else None
|
599
610
|
futures[executor.submit(
|
600
611
|
self.deduplicate_database,
|
601
|
-
db, tables, db_columns_map, dry_run, False
|
612
|
+
db, tables, db_columns_map, dry_run, False, reset_id
|
602
613
|
)] = db
|
603
614
|
for future in concurrent.futures.as_completed(futures):
|
604
615
|
db = futures[future]
|
@@ -614,7 +625,7 @@ class MySQLDeduplicator:
|
|
614
625
|
tables = tables_map.get(db) if tables_map else None
|
615
626
|
db_columns_map = columns_map.get(db) if columns_map else None
|
616
627
|
db_results = self.deduplicate_database(
|
617
|
-
db, tables, db_columns_map, dry_run, parallel
|
628
|
+
db, tables, db_columns_map, dry_run, parallel, reset_id
|
618
629
|
)
|
619
630
|
all_results[db] = db_results
|
620
631
|
total_dup = sum(
|
@@ -709,6 +720,146 @@ class MySQLDeduplicator:
|
|
709
720
|
"""
|
710
721
|
self.close()
|
711
722
|
|
723
|
+
def _has_foreign_key_dependency(self, database: str, table: str, pk: str) -> bool:
    """Report whether any foreign key references column ``pk`` of ``table``.

    Looks up INFORMATION_SCHEMA.KEY_COLUMN_USAGE for rows whose referenced
    schema, table and column equal the given values.

    Args:
        database: Schema that owns the table.
        table: Table whose column may be referenced.
        pk: Name of the column to check.

    Returns:
        True when at least one referencing row is found, otherwise False.
    """
    fk_check_sql = '''
        SELECT TABLE_NAME, COLUMN_NAME, CONSTRAINT_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME
        FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
        WHERE REFERENCED_TABLE_SCHEMA = %s AND REFERENCED_TABLE_NAME = %s AND REFERENCED_COLUMN_NAME = %s
    '''
    lookup_args = (database, table, pk)
    with self._get_connection() as conn, conn.cursor() as cursor:
        cursor.execute(f"USE `{database}`")
        cursor.execute(fk_check_sql, lookup_args)
        referencing_rows = cursor.fetchall()
        # Any row at all means some other table depends on this column.
        if referencing_rows:
            return True
        return False
|
736
|
+
|
737
|
+
def _get_table_create_sql_and_pk(self, database: str, table: str) -> tuple:
    """Fetch the table's CREATE statement and its primary-key column names.

    Args:
        database: Schema that owns the table.
        table: Table to inspect.

    Returns:
        A ``(create_sql, pk_columns)`` pair: the ``Create Table`` text reported
        by SHOW CREATE TABLE, and the ``Column_name`` of every row SHOW KEYS
        returns for the PRIMARY key (an empty list when there are none).
    """
    with self._get_connection() as conn, conn.cursor() as cursor:
        cursor.execute(f"USE `{database}`")

        cursor.execute(f"SHOW CREATE TABLE `{database}`.`{table}`")
        ddl_row = cursor.fetchone()
        create_sql = ddl_row['Create Table']

        cursor.execute(f"SHOW KEYS FROM `{database}`.`{table}` WHERE Key_name = 'PRIMARY'")
        pk_columns = []
        for key_row in cursor.fetchall():
            pk_columns.append(key_row['Column_name'])

        return create_sql, pk_columns
|
748
|
+
|
749
|
+
def _make_temp_table_sql(self, create_sql: str, table: str, temp_table: str, pk: str, pk_columns: list) -> str:
|
750
|
+
"""生成临时表的CREATE语句,仅替换id字段类型。"""
|
751
|
+
def replace_id_type(sql):
|
752
|
+
lines = sql.split('\n')
|
753
|
+
new_lines = []
|
754
|
+
for line in lines:
|
755
|
+
if re.match(rf'\s*`{pk}` ', line):
|
756
|
+
if pk_columns == [pk]:
|
757
|
+
line = re.sub(r'`' + pk + r'`\s+[^,]*', f'`{pk}` INT NOT NULL AUTO_INCREMENT', line)
|
758
|
+
else:
|
759
|
+
line = re.sub(r'`' + pk + r'`\s+[^,]*', f'`{pk}` INT NOT NULL', line)
|
760
|
+
new_lines.append(line)
|
761
|
+
return '\n'.join(new_lines)
|
762
|
+
create_sql_temp = re.sub(
|
763
|
+
rf'CREATE TABLE `{table}`',
|
764
|
+
f'CREATE TABLE `{temp_table}`',
|
765
|
+
create_sql,
|
766
|
+
count=1
|
767
|
+
)
|
768
|
+
create_sql_temp = replace_id_type(create_sql_temp)
|
769
|
+
create_sql_temp = re.sub(r'AUTO_INCREMENT=\d+', '', create_sql_temp)
|
770
|
+
return create_sql_temp
|
771
|
+
|
772
|
+
def _create_and_fill_temp_table(self, database: str, table: str, temp_table: str, pk: str) -> list:
    """Copy every row of ``table`` into ``temp_table`` with renumbered ids.

    Despite its name this method does not create ``temp_table``; the table
    must already exist. Rows are read ordered by ``pk`` ascending and the
    ``pk`` value written to the copy is a counter starting at 1.

    Args:
        database: Schema that owns both tables.
        table: Source table.
        temp_table: Destination table (must already exist).
        pk: Name of the id column to renumber.

    Returns:
        All column names of ``table`` as reported by SHOW COLUMNS.
    """
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            cursor.execute(f"SHOW COLUMNS FROM `{database}`.`{table}`")
            columns = [row['Field'] for row in cursor.fetchall()]

            # Build both lists from parts so a table whose only column is the
            # pk does not produce a dangling leading comma.
            data_columns = [f'`{col}`' for col in columns if col != pk]
            target_list = ', '.join(data_columns + [f'`{pk}`'])
            select_list = ', '.join(data_columns + [f'(@rownum:=@rownum+1) as `{pk}`'])

            # NOTE(review): the numbering relies on the @rownum assignment being
            # evaluated in ORDER BY order - confirm this holds on the target
            # MySQL version.
            insert_sql = (
                f"INSERT INTO `{database}`.`{temp_table}` ({target_list}) "
                f"SELECT {select_list} "
                f"FROM `{database}`.`{table}` JOIN (SELECT @rownum:=0) r ORDER BY `{pk}` ASC"
            )
            cursor.execute(insert_sql)
        # The sibling writers in this class commit explicitly; do the same so
        # the copied rows are not lost when the connection is released without
        # autocommit. A no-op if autocommit is enabled.
        conn.commit()
    return columns
|
784
|
+
|
785
|
+
def _swap_tables_with_backup(self, database: str, table: str, temp_table: str, bak_table: str):
    """Promote ``temp_table`` to ``table`` and keep the old table as a backup.

    Both renames are issued in one RENAME TABLE statement so the swap is
    atomic: there is no moment at which ``table`` does not exist, and a
    failure leaves both tables under their original names.

    Args:
        database: Schema that owns the tables.
        table: Live table name; afterwards it refers to the former temp table.
        temp_table: Table holding the renumbered copy.
        bak_table: Name the current live table is renamed to.
    """
    # MySQL processes the pairs left to right within a single atomic operation.
    swap_sql = (
        f"RENAME TABLE `{database}`.`{table}` TO `{database}`.`{bak_table}`, "
        f"`{database}`.`{temp_table}` TO `{database}`.`{table}`"
    )
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"USE `{database}`")
            cursor.execute(swap_sql)
            conn.commit()
|
793
|
+
|
794
|
+
def _check_and_cleanup_backup(self, database: str, table: str, bak_table: str) -> bool:
    """Drop the backup table once its row count matches the new table's.

    Args:
        database: Schema that owns both tables.
        table: The table now live under the original name.
        bak_table: The backup holding the pre-swap data.

    Returns:
        True when the counts are equal and the backup was dropped; False when
        they differ, in which case an error is logged and the backup is kept.
    """
    with self._get_connection() as conn, conn.cursor() as cursor:
        cursor.execute(f"USE `{database}`")

        cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
        live_rows = cursor.fetchone()['cnt']

        cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{bak_table}`")
        backup_rows = cursor.fetchone()['cnt']

        # Guard clause: on a mismatch keep the backup for manual inspection.
        if live_rows != backup_rows:
            logger.error('id重排后数据量不一致,未删除备份表', {'库': database, '表': table, '新表行数': live_rows, '备份表行数': backup_rows})
            return False

        cursor.execute(f"DROP TABLE `{database}`.`{bak_table}`")
        conn.commit()
        return True
|
810
|
+
|
811
|
+
def _rollback_table_swap(self, database: str, table: str, bak_table: str):
    """Best-effort rollback: rename the backup to ``table`` if ``table`` is missing.

    Nothing is done unless ``bak_table`` exists and ``table`` does not. Any
    exception is logged and swallowed so a failed rollback never masks the
    error that triggered it.

    Args:
        database: Schema that owns the tables.
        table: Original table name to restore.
        bak_table: Backup table created by the swap.
    """
    # Exact, parameterized name comparison. SHOW TABLES LIKE would treat "_"
    # and "%" inside the names as wildcards (every backup name contains "_"),
    # so a similarly named table could be mistaken for the one being checked.
    exists_sql = (
        "SELECT 1 FROM INFORMATION_SCHEMA.TABLES "
        "WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s LIMIT 1"
    )
    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(f"USE `{database}`")

                cursor.execute(exists_sql, (database, bak_table))
                if not cursor.fetchone():
                    return  # no backup: nothing to restore from

                cursor.execute(exists_sql, (database, table))
                if cursor.fetchone():
                    return  # original name is still in use: leave it alone

                cursor.execute(f"RENAME TABLE `{database}`.`{bak_table}` TO `{database}`.`{table}`")
                conn.commit()
                logger.info('回滚成功,已恢复原表', {'库': database, '表': table})
    except Exception as e2:
        logger.error('回滚失败', {'库': database, '表': table, '异常': str(e2)})
|
826
|
+
|
827
|
+
def _reset_id_column(self, database: str, table: str) -> bool:
    """Renumber the id column (``self.primary_key``) of ``table`` from 1.

    Steps: refuse if the column is referenced by a foreign key; read the
    table DDL and primary key; create a temporary table from the rewritten
    DDL; copy the rows with new ids; swap the temporary table in while
    keeping the old one as a backup; verify and drop the backup.

    On any exception the swap is rolled back (best effort), the temporary
    table is dropped (best effort) and False is returned.

    Args:
        database: Schema that owns the table.
        table: Table whose ids are renumbered.

    Returns:
        True when the ids were renumbered and the backup was removed,
        False otherwise.
    """
    pk = self.primary_key

    # Truncate the table part, never the suffix: the pid/thread id is what
    # keeps concurrent runs from colliding on the temp table name.
    temp_suffix = f"_resetid_{os.getpid()}_{threading.get_ident()}"
    safe_table = re.sub(r'[^a-zA-Z0-9_]', '_', table)
    table_budget = max(60 - len("temp_") - len(temp_suffix), 0)
    temp_table = f"temp_{safe_table[:table_budget]}{temp_suffix}"

    # MySQL identifiers are limited to 64 characters; keep the timestamp intact.
    bak_suffix = f"_bak_{int(time.time())}"
    bak_table = f"{table[:64 - len(bak_suffix)]}{bak_suffix}"

    try:
        # 1. Renumbering would break rows in other tables that reference the id.
        if self._has_foreign_key_dependency(database, table, pk):
            logger.warning('存在外键依赖,拒绝重排id', {'库': database, '表': table})
            return False
        # 2. Read the table structure and primary key.
        create_sql, pk_columns = self._get_table_create_sql_and_pk(database, table)
        # 3. Build the DDL for the temporary table.
        create_sql_temp = self._make_temp_table_sql(create_sql, table, temp_table, pk, pk_columns)
        # 4. Create the temporary table, replacing any leftover from an earlier run.
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(f"USE `{database}`")
                cursor.execute(f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`")
                cursor.execute(create_sql_temp)
                conn.commit()
        # 5. Copy the rows with renumbered ids.
        self._create_and_fill_temp_table(database, table, temp_table, pk)
        # 6. Swap the tables.
        self._swap_tables_with_backup(database, table, temp_table, bak_table)
        # 7. Verify the row counts and drop the backup.
        if not self._check_and_cleanup_backup(database, table, bak_table):
            return False
        logger.info('id重排完成并安全删除备份表,主键信息已保留', {'库': database, '表': table})
        return True
    except Exception as e:
        logger.error('id重排失败,尝试回滚', {'库': database, '表': table, '异常': str(e)})
        self._rollback_table_swap(database, table, bak_table)
        # Do not leave the temp table behind. After a completed swap the name
        # no longer exists, so IF EXISTS makes this a no-op in that case.
        try:
            with self._get_connection() as conn:
                with conn.cursor() as cursor:
                    cursor.execute(f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`")
                    conn.commit()
        except Exception as cleanup_error:
            logger.warning('临时表清理失败', {'库': database, '表': temp_table, '异常': str(cleanup_error)})
        return False
|
862
|
+
|
712
863
|
|
713
864
|
def main():
|
714
865
|
deduplicator = MySQLDeduplicator(
|
@@ -719,18 +870,18 @@ def main():
|
|
719
870
|
)
|
720
871
|
|
721
872
|
# 全库去重(单线程)
|
722
|
-
deduplicator.deduplicate_all(dry_run=False, parallel=False)
|
873
|
+
deduplicator.deduplicate_all(dry_run=False, parallel=False, reset_id=False)
|
723
874
|
|
724
875
|
# # 指定数据库去重(多线程)
|
725
876
|
# logger.info('调用deduplicate_database')
|
726
|
-
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
|
877
|
+
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reset_id=False)
|
727
878
|
|
728
879
|
# # 指定表去重(使用特定列)
|
729
|
-
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
|
880
|
+
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False, reset_id=False)
|
730
881
|
|
731
882
|
# 关闭连接
|
732
883
|
deduplicator.close()
|
733
884
|
|
734
885
|
if __name__ == '__main__':
|
735
|
-
|
886
|
+
main()
|
736
887
|
pass
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=jUdj4-uaa03JUoNnXK_fTx_XQfDwjeFprE71R3ZenRY,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
8
8
|
mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=XSAgt6HqvzDyZSiv4mHli5fA3p3ePn5g3HupqI2cyVo,41444
|
12
12
|
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
14
|
mdbq/mysql/uploader.py,sha256=LxPlAfSNhQbLu-or4wxa-vLjCw5_PIN3ZVoksWUJazQ,61701
|
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
26
|
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
-
mdbq-3.11.
|
28
|
-
mdbq-3.11.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
-
mdbq-3.11.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
-
mdbq-3.11.5.dist-info/RECORD,,
|
27
|
+
mdbq-3.11.6.dist-info/METADATA,sha256=gEcCBqGoPLhugYleGDv1r2YV_lHPL3AnGHD_dHTxY3Q,364
|
28
|
+
mdbq-3.11.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.11.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.11.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|