mdbq 3.11.9__py3-none-any.whl → 3.11.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +157 -54
- {mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/METADATA +1 -1
- {mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/RECORD +6 -6
- {mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/WHEEL +0 -0
- {mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.11.9'
+VERSION = '3.11.10'
mdbq/mysql/deduplicator.py
CHANGED
@@ -37,7 +37,7 @@ class MySQLDeduplicator:
 
     功能:
     1. 自动检测并删除MySQL数据库中的重复数据
-    2.
+    2. 支持全库扫描或指定表处理
    3. 支持多线程/多进程安全处理
    4. 完善的错误处理和日志记录
 
@@ -215,7 +215,7 @@ class MySQLDeduplicator:
             last_exception = None
             for attempt in range(self.max_retries + 1):
                 try:
-                    logger.debug(f'调用{func.__name__},第{attempt+1}
+                    logger.debug(f'调用{func.__name__},第{attempt+1}次连接', {'args': args, 'kwargs': kwargs})
                     return func(self, *args, **kwargs)
                 except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                     last_exception = e
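The changed line sits inside the class's connection-retry wrapper: each call is attempted up to self.max_retries + 1 times, and pymysql connection-level errors are caught and remembered before the next attempt. A minimal standalone sketch of that pattern (the wait_seconds backoff and the final re-raise are assumptions, not shown in this hunk):

import functools
import logging
import time

import pymysql

logger = logging.getLogger(__name__)

def retry_on_connection_error(max_retries=3, wait_seconds=5):
    # Retry a method while pymysql reports a connection-level failure.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            last_exception = None
            for attempt in range(max_retries + 1):
                try:
                    logger.debug('calling %s, attempt %d', func.__name__, attempt + 1)
                    return func(self, *args, **kwargs)
                except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                    last_exception = e
                    time.sleep(wait_seconds)  # back off before retrying
            raise last_exception  # every attempt failed
        return wrapper
    return decorator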
@@ -362,13 +362,59 @@ class MySQLDeduplicator:
         else:
             logger.debug('date_column已存在索引', {"库": database, "表": table, "date_column": date_column})
 
+    def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
+        """
+        生成器:分批拉取表数据,避免一次性加载全部数据到内存。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            select_cols (str): 选择的列字符串。
+            select_where (str): where条件字符串。
+            batch_size (int): 每批拉取的行数。
+        Yields:
+            dict: 每行数据。
+        """
+        offset = 0
+        while True:
+            sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql)
+                    rows = cursor.fetchall()
+            if not rows:
+                break
+            for row in rows:
+                yield row
+            if len(rows) < batch_size:
+                break
+            offset += batch_size
+
+    def _get_all_dates(self, database: str, table: str, date_column: str) -> list:
+        """
+        获取表中所有不同的日期分区(按天)。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            date_column (str): 日期列名。
+        Returns:
+            List: 所有不同的日期(字符串)。
+        """
+        sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute(sql)
+                return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
+
     def _deduplicate_table(
         self,
         database: str,
         table: str,
         columns: Optional[List[str]] = None,
         dry_run: bool = False,
-        use_python_dedup: bool = False
+        use_python_dedup: bool = False,
+        dedup_start_date: Optional[str] = None,
+        dedup_end_date: Optional[str] = None,
+        lock_table: bool = True
     ) -> Tuple[int, int]:
         """
         执行单表去重。
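The new _row_generator keeps memory bounded at one batch of rows by paging with LIMIT/OFFSET and stopping on an empty or short page. A self-contained sketch of the same loop, with a hypothetical fetch_batch callable standing in for the connection/cursor plumbing above:

def iter_rows(fetch_batch, batch_size=10000):
    # Yield rows page by page; a short page means the scan is done.
    offset = 0
    while True:
        rows = fetch_batch(limit=batch_size, offset=offset)
        if not rows:
            break
        yield from rows
        if len(rows) < batch_size:
            break
        offset += batch_size

# Quick check against an in-memory stand-in for the SQL query:
data = [{'id': i} for i in range(25)]
fetch = lambda limit, offset: data[offset:offset + limit]
assert len(list(iter_rows(fetch, batch_size=10))) == 25

Note that OFFSET pagination re-scans skipped rows on every page, so its cost grows with the offset; that matters less here because the per-day partitioning introduced below keeps each scan small, but a keyset scan (WHERE pk > last_seen ORDER BY pk LIMIT n) would be the usual alternative on very large tables.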
@@ -376,7 +422,7 @@ class MySQLDeduplicator:
         如果date_column在exclude_columns中,直接跳过该表。
         优化:分批删除时用主键、避免重复建/删临时表、并发处理每天。
         """
-        if not self._acquire_table_lock(database, table):
+        if lock_table and not self._acquire_table_lock(database, table):
             return (0, 0)
         temp_table = None
         try:
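The lock_table guard exists because the per-day branch added in the next hunk re-enters _deduplicate_table from worker threads: only the outermost call should acquire and release the table lock, and re-entrant calls pass lock_table=False. A minimal sketch of that convention (the in-process lock registry here is an assumption; the package's own lock helpers may work differently):

import threading

_locks = {}

def _acquire(table):
    return _locks.setdefault(table, threading.Lock()).acquire(blocking=False)

def _release(table):
    _locks[table].release()

def dedup(table, lock_table=True):
    # The outermost call takes the lock; per-day re-entries skip it so
    # they do not deadlock against their own table.
    if lock_table and not _acquire(table):
        return (0, 0)
    try:
        return (0, 0)  # stand-in for the real dedup work
    finally:
        if lock_table:
            _release(table)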
@@ -393,53 +439,73 @@ class MySQLDeduplicator:
             # 2. 判断表是否包含date_column
             has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
             # 如果包含date_column,自动检查并创建索引
-            if has_time_col:
+            if has_time_col and dedup_start_date is None and dedup_end_date is None:
                 self._ensure_index(database, table, time_col)
-
+                # 按天分区多线程处理
+                all_dates = self._get_all_dates(database, table, time_col)
+                total_dup = 0
+                total_del = 0
+                def process_date(date_val):
+                    try:
+                        logger.debug('按天分区去重', {"库": database, "表": table, "日期": date_val})
+                        dup_count, affected_rows = self._deduplicate_table(
+                            database, table, columns, dry_run, use_python_dedup,
+                            dedup_start_date=date_val, dedup_end_date=date_val,
+                            lock_table=False
+                        )
+                        return (dup_count, affected_rows, date_val, None)
+                    except Exception as e:
+                        logger.error('分区去重异常', {"库": database, "表": table, "日期": date_val, "异常": str(e), "func": sys._getframe().f_code.co_name})
+                        return (0, 0, date_val, str(e))
+                with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                    future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
+                    for future in concurrent.futures.as_completed(future_to_date):
+                        dup_count, affected_rows, date_val, err = future.result()
+                        if err:
+                            logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+                        total_dup += dup_count
+                        total_del += affected_rows
+                return (total_dup, total_del)
+            # 获取去重列
             use_columns = columns or all_columns
             use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
             invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
             if invalid_columns:
                 logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
             if not use_columns:
-                logger.error('没有有效的去重列', {"库": database, "表": table})
+                logger.error('没有有效的去重列', {"库": database, "表": table, "func": sys._getframe().f_code.co_name})
                 return (0, 0)
             pk = self.primary_key
             pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
             # 判断是否需要加日期区间条件
             where_sql = ''
-            if has_time_col and
-                where_sql = f"t.`{time_col}` >= '{
+            if has_time_col and dedup_start_date and dedup_end_date:
+                where_sql = f"t.`{time_col}` >= '{dedup_start_date}' AND t.`{time_col}` <= '{dedup_end_date}'"
             # 获取原始数据总量(只统计区间内数据)
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
-                    count_where = f"WHERE `{time_col}` >= '{
+                    count_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
                     count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
                     logger.debug('执行SQL', {'sql': count_sql})
                     cursor.execute(count_sql)
                     total_count_row = cursor.fetchone()
                     total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
-            logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
+            logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name, "数据日期": dedup_end_date})
             column_list = ', '.join([f'`{col}`' for col in use_columns])
 
             # 用Python查找重复
             if use_python_dedup:
-
-                # 1. 拉取所有数据
+                # 1. 拉取所有数据(生成器分批拉取)
                 select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
-                select_where = f"WHERE `{time_col}` >= '{
+                select_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
                 select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
                 logger.debug('用Python查找重复,拉取数据SQL', {'sql': select_sql})
-
-                with conn.cursor() as cursor:
-                    cursor.execute(select_sql)
-                    rows = cursor.fetchall()
-                # 2. 分组找重复
+                # 用生成器分批拉取
                 grouped = defaultdict(list)
-                for row in rows:
+                for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
                     key = tuple(row[col] for col in use_columns)
                     grouped[key].append(row[pk_real])
-                #
+                # 2. 统计重复组和待删除id
                 dup_count = 0
                 del_ids = []
                 for ids in grouped.values():
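The added branch turns one whole-table pass into one task per distinct day: _get_all_dates lists the partitions, each worker re-enters _deduplicate_table with dedup_start_date == dedup_end_date == that day and lock_table=False, and the (duplicates, deletions) pairs are summed as futures complete. The aggregation shape, reduced to a runnable sketch with a stand-in worker:

import concurrent.futures

def process_date(date_val):
    # Stand-in for the recursive _deduplicate_table(...) call; returns
    # (dup_count, affected_rows, date_val, error_or_None).
    return (1, 2, date_val, None)

all_dates = ['2025-05-27', '2025-05-28']
total_dup = total_del = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_date = {executor.submit(process_date, d): d for d in all_dates}
    for future in concurrent.futures.as_completed(future_to_date):
        dup_count, affected_rows, date_val, err = future.result()
        if err:
            print('partition failed:', date_val, err)
        total_dup += dup_count
        total_del += affected_rows
print(total_dup, total_del)  # 2 4

Because process_date catches exceptions and returns them as data, one bad day is logged as a failed partition without aborting the rest of the table.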
@@ -459,13 +525,13 @@ class MySQLDeduplicator:
                             batch_deleted = cursor.rowcount
                             affected_rows += batch_deleted
                             conn.commit()
-                logger.info('
+                logger.info('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode, "数据日期": dedup_end_date})
                 return (dup_count, affected_rows)
             # SQL方式查找重复
-            temp_table = self.
+            temp_table = self._make_temp_table_name(table)
             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
             # 创建临时表时加where条件
-            create_temp_where = f"WHERE `{time_col}` >= '{
+            create_temp_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
             create_temp_sql = f"""
                 CREATE TABLE `{database}`.`{temp_table}` AS
                 SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
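The SQL path materializes one row per duplicate group in a temp table, keeping MIN(pk) as the survivor. The hunk cuts the statement off after the SELECT list; a sketch of the likely overall shape (the GROUP BY/HAVING tail and the DELETE join are assumptions inferred from that SELECT, not the verbatim package SQL):

database, table, temp_table = 'my_db', 'my_table', 'temp_my_table_dedup'
pk_real = 'id'
column_list = '`name`, `date`'

create_temp_sql = f"""
    CREATE TABLE `{database}`.`{temp_table}` AS
    SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
    FROM `{database}`.`{table}`
    GROUP BY {column_list}
    HAVING COUNT(*) > 1
"""

# Rows whose pk is not the kept min_id of their group get deleted
# (NULL-valued key columns would need the NULL-safe <=> comparison):
delete_sql = f"""
    DELETE t FROM `{database}`.`{table}` t
    JOIN `{database}`.`{temp_table}` d
      ON t.`name` = d.`name` AND t.`date` = d.`date`
    WHERE t.`{pk_real}` <> d.`min_id`
"""
print(create_temp_sql)
print(delete_sql)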
@@ -482,7 +548,7 @@ class MySQLDeduplicator:
                     dup_count_row = cursor.fetchone()
                     dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                     if dup_count == 0:
-                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "
+                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "数据日期": dedup_end_date})
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
@@ -520,9 +586,9 @@ class MySQLDeduplicator:
                                 break
                             if batch_deleted < self.batch_size:
                                 break
-                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "
+                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode, "数据日期": dedup_end_date})
                     else:
-                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "
+                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
                         affected_rows = 0
                     cursor.execute(drop_temp_sql)
                     conn.commit()
@@ -541,7 +607,8 @@ class MySQLDeduplicator:
                 logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
             return (0, 0)
         finally:
-            self._release_table_lock(database, table)
+            if lock_table:
+                self._release_table_lock(database, table)
 
     def deduplicate_table(
         self,
@@ -637,7 +704,7 @@ class MySQLDeduplicator:
             logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
             futures[executor.submit(
                 self.deduplicate_table,
-                database, table, columns, dry_run, reorder_id,
+                database, table, columns, dry_run, reorder_id, use_python_dedup
             )] = table
         for future in concurrent.futures.as_completed(futures):
             table = futures[future]
@@ -653,12 +720,12 @@ class MySQLDeduplicator:
             for table in target_tables:
                 columns = columns_map.get(table) if columns_map else None
                 dup_count, affected_rows = self.deduplicate_table(
-                    database, table, columns, dry_run, reorder_id,
+                    database, table, columns, dry_run, reorder_id, use_python_dedup
                 )
                 results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.info('单库完成', {"库": database, "
+            logger.info('单库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
             return results
         except Exception as e:
             logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
@@ -708,7 +775,7 @@ class MySQLDeduplicator:
             db_columns_map = columns_map.get(db) if columns_map else None
             futures[executor.submit(
                 self.deduplicate_database,
-                db, tables, db_columns_map, dry_run, False, reorder_id,
+                db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
             )] = db
         for future in concurrent.futures.as_completed(futures):
             db = futures[future]
@@ -724,7 +791,7 @@ class MySQLDeduplicator:
             tables = tables_map.get(db) if tables_map else None
             db_columns_map = columns_map.get(db) if columns_map else None
             db_results = self.deduplicate_database(
-                db, tables, db_columns_map, dry_run, parallel, reorder_id,
+                db, tables, db_columns_map, dry_run, parallel, reorder_id, use_python_dedup
             )
             all_results[db] = db_results
             total_dup = sum(
@@ -735,7 +802,7 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.info('全局完成', {"
+            logger.info('全局完成', {"总重复组": total_dup, "总删除行": total_del, "详细结果": dict(all_results)})
             return all_results
         except Exception as e:
             logger.error('异常', {"error": str(e), 'traceback': repr(e)})
@@ -865,22 +932,12 @@ class MySQLDeduplicator:
             if not self._check_table_exists(database, table):
                 logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
                 return False
-            # 检查id
-
-            with conn.cursor() as cursor:
-                cursor.execute("""
-                    SELECT COLUMN_NAME, COLUMN_KEY
-                    FROM INFORMATION_SCHEMA.COLUMNS
-                    WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
-                """, (database, table))
-                columns_info = cursor.fetchall()
-            columns = [row['COLUMN_NAME'] for row in columns_info]
-            id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+            # 检查id列、主键信息(用_get_table_info)
+            columns, pk_cols, id_is_pk = self._get_table_info(database, table, id_column)
             if id_column not in columns:
                 logger.warning('表无id列,跳过id重排', {"库": database, "表": table})
                 return False
             # 检查主键是否为单列id
-            pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
             if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
                 logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
                 return False
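The eleven inline lines collapse into _get_table_info, which is added near the end of this diff. Restated standalone (a sketch; it assumes connections yield dict rows, e.g. pymysql's DictCursor, which is why rows are indexed by 'COLUMN_NAME'):

import pymysql

def get_table_info(conn, database, table, id_column='id'):
    # Returns (all columns, primary-key columns, whether id_column is PRI/UNI).
    with conn.cursor(pymysql.cursors.DictCursor) as cursor:
        cursor.execute(
            """
            SELECT COLUMN_NAME, COLUMN_KEY
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
            """,
            (database, table),
        )
        columns_info = cursor.fetchall()
    columns = [row['COLUMN_NAME'] for row in columns_info]
    pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
    id_is_pk = any(
        row['COLUMN_NAME'].lower() == id_column.lower()
        and row['COLUMN_KEY'] in ('PRI', 'UNI')
        for row in columns_info
    )
    return columns, pk_cols, id_is_pk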
@@ -903,9 +960,9 @@ class MySQLDeduplicator:
             if dry_run:
                 logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
                 return True
-            temp_table = self.
+            temp_table = self._make_temp_table_name(table)
             temp_table_quoted = f"`{database}`.`{temp_table}`"
-            backup_table = self.
+            backup_table = self._make_backup_table_name(table)
             backup_table_quoted = f"`{database}`.`{backup_table}`"
             try:
                 with self._get_connection() as conn:
@@ -1020,6 +1077,44 @@ class MySQLDeduplicator:
             return (prefix + suffix)[:max_length]
         return f"{prefix}{base[:remain]}{suffix}"[:max_length]
 
+    def _get_table_info(self, database: str, table: str, id_column: str = None):
+        """
+        获取表的所有列名、主键列名列表、指定id列是否为主键。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            id_column (str): id列名,默认使用self.primary_key。
+        Returns:
+            Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
+        """
+        id_column = id_column or self.primary_key
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    SELECT COLUMN_NAME, COLUMN_KEY
+                    FROM INFORMATION_SCHEMA.COLUMNS
+                    WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+                """, (database, table))
+                columns_info = cursor.fetchall()
+        columns = [row['COLUMN_NAME'] for row in columns_info]
+        pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+        id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+        return columns, pk_cols, id_is_pk
+
+    def _make_temp_table_name(self, base: str) -> str:
+        """
+        生成临时表名,带有 temp_ 前缀和 _dedup_ 进程线程后缀。
+        """
+        suffix = f"_dedup_{os.getpid()}_{threading.get_ident()}"
+        return self._make_safe_table_name(base, prefix="temp_", suffix=suffix)
+
+    def _make_backup_table_name(self, base: str) -> str:
+        """
+        生成备份表名,带有 backup_ 前缀和时间戳+uuid后缀。
+        """
+        suffix = f"_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+        return self._make_safe_table_name(base, prefix="backup_", suffix=suffix)
+
 
 def main():
     deduplicator = MySQLDeduplicator(
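The two naming helpers differ deliberately: temp names embed the pid and thread id so concurrent workers never collide on the same table, while backup names embed a timestamp plus a uuid fragment so repeated runs keep distinct backups. The suffix shapes, illustrated (the truncation applied by _make_safe_table_name is not reproduced here):

import os
import threading
import time
import uuid

base = 'my_table'
temp_name = 'temp_' + base + f"_dedup_{os.getpid()}_{threading.get_ident()}"
backup_name = 'backup_' + base + f"_{int(time.time())}_{uuid.uuid4().hex[:8]}"
print(temp_name)    # e.g. temp_my_table_dedup_41234_140266...
print(backup_name)  # e.g. backup_my_table_1718000000_a1b2c3d4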
@@ -1027,18 +1122,26 @@ def main():
         password='pwd',
         host='localhost',
         port=3306,
-        date_range=['2025-05-27', '2025-05-28'],
-        exclude_tables={'推广数据2': [
+        # date_range=['2025-05-27', '2025-05-28'],
+        exclude_tables={'推广数据2': [
+            # '地域报表_城市_2025_04',
+            # '地域报表_城市_2025_05',
+            # '地域报表_城市_2025_06',
+            '地域报表_城市_2025_04_copy1',
+            '地域报表_城市_2025_05_copy1',
+            '地域报表_城市_2025_06_copy1',
+            '主体报表_2025_copy1'
+        ]}
     )
 
     # 全库去重(单线程)
-    deduplicator.deduplicate_all(dry_run=
+    deduplicator.deduplicate_all(dry_run=True, parallel=True, reorder_id=True)
 
     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False, reorder_id=
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False, reorder_id=True)
 
     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('my_db', 'my_table', columns=[
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=["name", "date"], dry_run=False, reorder_id=False)
 
     # # 重排id列
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
@@ -1047,5 +1150,5 @@ def main():
     deduplicator.close()
 
 if __name__ == '__main__':
-    main()
+    # main()
     pass
{mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=L9HK2W1LgO8Zc5gpJgI1uJ5J0VRcUyMXHr1ZT-FeNOM,19
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=w8etA5dAsY7g58bWU3SQt7n_OWnS9Y2TVh0D7m0MK9E,57961
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
 mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.9.dist-info/METADATA,sha256=
-mdbq-3.11.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-mdbq-3.11.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
-mdbq-3.11.9.dist-info/RECORD,,
+mdbq-3.11.10.dist-info/METADATA,sha256=dVhkC84iq1GWtV6onfsLj18CwfGnIo1bXXDa-TXUU1E,365
+mdbq-3.11.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.11.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.11.10.dist-info/RECORD,,
{mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/WHEEL
File without changes
{mdbq-3.11.9.dist-info → mdbq-3.11.10.dist-info}/top_level.txt
File without changes