mdbq 3.11.11__py3-none-any.whl → 3.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +127 -63
- {mdbq-3.11.11.dist-info → mdbq-3.12.0.dist-info}/METADATA +1 -1
- {mdbq-3.11.11.dist-info → mdbq-3.12.0.dist-info}/RECORD +6 -6
- {mdbq-3.11.11.dist-info → mdbq-3.12.0.dist-info}/WHEEL +0 -0
- {mdbq-3.11.11.dist-info → mdbq-3.12.0.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.11'
|
1
|
+
VERSION = '3.12.0'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -14,6 +14,7 @@ from collections import defaultdict
|
|
14
14
|
import sys
|
15
15
|
from datetime import datetime
|
16
16
|
import uuid
|
17
|
+
from contextlib import contextmanager
|
17
18
|
|
18
19
|
|
19
20
|
warnings.filterwarnings('ignore')
|
@@ -47,8 +48,8 @@ class MySQLDeduplicator:
|
|
47
48
|
batch_size: int = 1000,
|
48
49
|
skip_system_dbs: bool = True,
|
49
50
|
max_retries: int = 3,
|
50
|
-
|
51
|
-
pool_size: int =
|
51
|
+
retry_waiting_time: int = 5,
|
52
|
+
pool_size: int = 10,
|
52
53
|
primary_key: str = 'id',
|
53
54
|
date_range: Optional[List[str]] = None,
|
54
55
|
recent_month: Optional[int] = None,
|
@@ -87,15 +88,30 @@ class MySQLDeduplicator:
|
|
87
88
|
cursorclass=pymysql.cursors.DictCursor
|
88
89
|
)
|
89
90
|
|
91
|
+
# 并发模式要将 pool_size 加大
|
92
|
+
MAX_POOL_SIZE = 200
|
93
|
+
MAX_WORKERS = 4
|
94
|
+
if max_workers > MAX_WORKERS:
|
95
|
+
logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
|
96
|
+
max_workers = MAX_WORKERS
|
97
|
+
expected_threads = max_workers * 10
|
98
|
+
if pool_size < expected_threads:
|
99
|
+
logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
|
100
|
+
pool_size = expected_threads
|
101
|
+
if pool_size > MAX_POOL_SIZE:
|
102
|
+
logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
|
103
|
+
pool_size = MAX_POOL_SIZE
|
104
|
+
self.max_workers = max_workers
|
105
|
+
self.pool_size = pool_size
|
106
|
+
|
90
107
|
# 配置参数
|
91
|
-
self.max_workers = min(max(1, max_workers), pool_size) # 限制最大线程数,不能超过连接池
|
92
108
|
self.batch_size = batch_size
|
93
109
|
self.skip_system_dbs = skip_system_dbs
|
94
110
|
self.max_retries = max_retries
|
95
|
-
self.
|
111
|
+
self.retry_waiting_time = retry_waiting_time
|
96
112
|
self.primary_key = primary_key
|
97
113
|
|
98
|
-
#
|
114
|
+
# 时间范围参数
|
99
115
|
self.date_column = date_column
|
100
116
|
self._dedup_start_date = None
|
101
117
|
self._dedup_end_date = None
|
@@ -128,6 +144,9 @@ class MySQLDeduplicator:
|
|
128
144
|
year -= 1
|
129
145
|
self._dedup_start_date = f"{year}-{month:02d}-01"
|
130
146
|
self._dedup_end_date = today.strftime("%Y-%m-%d")
|
147
|
+
|
148
|
+
if self._dedup_start_date and self._dedup_end_date:
|
149
|
+
logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
|
131
150
|
|
132
151
|
# 排除列处理,直接合并去重
|
133
152
|
self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
|
@@ -164,6 +183,14 @@ class MySQLDeduplicator:
|
|
164
183
|
logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
|
165
184
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
166
185
|
|
186
|
+
@contextmanager
|
187
|
+
def _conn_ctx(self):
|
188
|
+
conn = self._get_connection()
|
189
|
+
try:
|
190
|
+
yield conn
|
191
|
+
finally:
|
192
|
+
conn.close()
|
193
|
+
|
167
194
|
@staticmethod
|
168
195
|
def _retry_on_failure(func: Any) -> Any:
|
169
196
|
"""
|
@@ -187,7 +214,7 @@ class MySQLDeduplicator:
|
|
187
214
|
except (pymysql.OperationalError, pymysql.InterfaceError) as e:
|
188
215
|
last_exception = e
|
189
216
|
if attempt < self.max_retries:
|
190
|
-
wait_time = self.
|
217
|
+
wait_time = self.retry_waiting_time * (attempt + 1)
|
191
218
|
logger.warning(
|
192
219
|
f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
|
193
220
|
{'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
|
@@ -203,7 +230,6 @@ class MySQLDeduplicator:
|
|
203
230
|
raise Exception("未知错误")
|
204
231
|
return wrapper
|
205
232
|
|
206
|
-
@_retry_on_failure
|
207
233
|
def _get_databases(self) -> List[str]:
|
208
234
|
"""
|
209
235
|
获取所有非系统数据库列表,排除 exclude_databases。
|
@@ -212,7 +238,7 @@ class MySQLDeduplicator:
|
|
212
238
|
List[str]: 数据库名列表。
|
213
239
|
"""
|
214
240
|
sql = "SHOW DATABASES"
|
215
|
-
with self._get_connection() as conn:
|
241
|
+
with self._conn_ctx() as conn:
|
216
242
|
with conn.cursor() as cursor:
|
217
243
|
cursor.execute(sql)
|
218
244
|
all_dbs = [row['Database'] for row in cursor.fetchall()]
|
@@ -220,7 +246,6 @@ class MySQLDeduplicator:
|
|
220
246
|
filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
|
221
247
|
return filtered
|
222
248
|
|
223
|
-
@_retry_on_failure
|
224
249
|
def _get_tables(self, database: str) -> List[str]:
|
225
250
|
"""
|
226
251
|
获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。
|
@@ -231,15 +256,12 @@ class MySQLDeduplicator:
|
|
231
256
|
List[str]: 表名列表。
|
232
257
|
"""
|
233
258
|
sql = "SHOW TABLES"
|
234
|
-
|
235
|
-
with self._get_connection() as conn:
|
259
|
+
with self._conn_ctx() as conn:
|
236
260
|
with conn.cursor() as cursor:
|
237
261
|
cursor.execute(f"USE `{database}`")
|
238
262
|
cursor.execute(sql)
|
239
|
-
# 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
|
240
263
|
return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]
|
241
264
|
|
242
|
-
@_retry_on_failure
|
243
265
|
def _get_table_columns(self, database: str, table: str) -> List[str]:
|
244
266
|
"""
|
245
267
|
获取指定表的所有列名(排除主键列)。
|
@@ -256,14 +278,12 @@ class MySQLDeduplicator:
|
|
256
278
|
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
|
257
279
|
ORDER BY ORDINAL_POSITION
|
258
280
|
"""
|
259
|
-
|
260
|
-
with self._get_connection() as conn:
|
281
|
+
with self._conn_ctx() as conn:
|
261
282
|
with conn.cursor() as cursor:
|
262
283
|
cursor.execute(sql, (database, table))
|
263
284
|
return [row['COLUMN_NAME'] for row in cursor.fetchall()
|
264
285
|
if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
|
265
286
|
|
266
|
-
@_retry_on_failure
|
267
287
|
def _ensure_index(self, database: str, table: str, date_column: str) -> None:
|
268
288
|
"""
|
269
289
|
检查并为 date_column 自动创建索引(如果未存在)。
|
@@ -273,7 +293,7 @@ class MySQLDeduplicator:
|
|
273
293
|
table (str): 表名。
|
274
294
|
date_column (str): 需要检查的日期列名。
|
275
295
|
"""
|
276
|
-
with self._get_connection() as conn:
|
296
|
+
with self._conn_ctx() as conn:
|
277
297
|
with conn.cursor() as cursor:
|
278
298
|
# 检查索引是否已存在
|
279
299
|
cursor.execute(
|
@@ -295,7 +315,33 @@ class MySQLDeduplicator:
|
|
295
315
|
except Exception as e:
|
296
316
|
logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
|
297
317
|
|
298
|
-
|
318
|
+
def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
|
319
|
+
"""
|
320
|
+
生成器:分批拉取表数据,避免一次性加载全部数据到内存。
|
321
|
+
Args:
|
322
|
+
database (str): 数据库名。
|
323
|
+
table (str): 表名。
|
324
|
+
select_cols (str): 选择的列字符串。
|
325
|
+
select_where (str): where条件字符串。
|
326
|
+
batch_size (int): 每批拉取的行数。
|
327
|
+
Yields:
|
328
|
+
dict: 每行数据。
|
329
|
+
"""
|
330
|
+
offset = 0
|
331
|
+
while True:
|
332
|
+
sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
|
333
|
+
with self._conn_ctx() as conn:
|
334
|
+
with conn.cursor() as cursor:
|
335
|
+
cursor.execute(sql)
|
336
|
+
rows = cursor.fetchall()
|
337
|
+
if not rows:
|
338
|
+
break
|
339
|
+
for row in rows:
|
340
|
+
yield row
|
341
|
+
if len(rows) < batch_size:
|
342
|
+
break
|
343
|
+
offset += batch_size
|
344
|
+
|
299
345
|
def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
|
300
346
|
"""
|
301
347
|
获取表中所有不同的日期分区(按天)。
|
@@ -308,7 +354,7 @@ class MySQLDeduplicator:
|
|
308
354
|
List[str]: 所有不同的日期(字符串)。
|
309
355
|
"""
|
310
356
|
sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
|
311
|
-
with self._get_connection() as conn:
|
357
|
+
with self._conn_ctx() as conn:
|
312
358
|
with conn.cursor() as cursor:
|
313
359
|
cursor.execute(sql)
|
314
360
|
return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
|
@@ -367,7 +413,7 @@ class MySQLDeduplicator:
|
|
367
413
|
pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
|
368
414
|
where_sql = f"t.`{time_col}` = '{date_val}'"
|
369
415
|
# 获取原始数据总量(只统计当天数据)
|
370
|
-
with self._get_connection() as conn:
|
416
|
+
with self._conn_ctx() as conn:
|
371
417
|
with conn.cursor() as cursor:
|
372
418
|
count_where = f"WHERE `{time_col}` = '{date_val}'"
|
373
419
|
count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
|
@@ -394,7 +440,7 @@ class MySQLDeduplicator:
|
|
394
440
|
del_ids.extend(ids[1:])
|
395
441
|
affected_rows = 0
|
396
442
|
if not dry_run and del_ids:
|
397
|
-
with self._get_connection() as conn:
|
443
|
+
with self._conn_ctx() as conn:
|
398
444
|
with conn.cursor() as cursor:
|
399
445
|
for i in range(0, len(del_ids), self.batch_size):
|
400
446
|
batch_ids = del_ids[i:i+self.batch_size]
|
@@ -418,7 +464,7 @@ class MySQLDeduplicator:
|
|
418
464
|
GROUP BY {column_list}
|
419
465
|
HAVING COUNT(*) > 1
|
420
466
|
"""
|
421
|
-
with self._get_connection() as conn:
|
467
|
+
with self._conn_ctx() as conn:
|
422
468
|
with conn.cursor() as cursor:
|
423
469
|
logger.debug('创建临时表SQL', {'sql': create_temp_sql})
|
424
470
|
cursor.execute(create_temp_sql)
|
@@ -484,7 +530,7 @@ class MySQLDeduplicator:
|
|
484
530
|
pk = self.primary_key
|
485
531
|
pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
|
486
532
|
# 获取原始数据总量
|
487
|
-
with self._get_connection() as conn:
|
533
|
+
with self._conn_ctx() as conn:
|
488
534
|
with conn.cursor() as cursor:
|
489
535
|
count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
|
490
536
|
logger.debug('执行SQL', {'sql': count_sql})
|
@@ -508,7 +554,7 @@ class MySQLDeduplicator:
|
|
508
554
|
del_ids.extend(ids[1:])
|
509
555
|
affected_rows = 0
|
510
556
|
if not dry_run and del_ids:
|
511
|
-
with self._get_connection() as conn:
|
557
|
+
with self._conn_ctx() as conn:
|
512
558
|
with conn.cursor() as cursor:
|
513
559
|
for i in range(0, len(del_ids), self.batch_size):
|
514
560
|
batch_ids = del_ids[i:i+self.batch_size]
|
@@ -529,7 +575,7 @@ class MySQLDeduplicator:
|
|
529
575
|
GROUP BY {column_list}
|
530
576
|
HAVING COUNT(*) > 1
|
531
577
|
"""
|
532
|
-
with self._get_connection() as conn:
|
578
|
+
with self._conn_ctx() as conn:
|
533
579
|
with conn.cursor() as cursor:
|
534
580
|
logger.debug('创建临时表SQL', {'sql': create_temp_sql})
|
535
581
|
cursor.execute(create_temp_sql)
|
@@ -584,7 +630,7 @@ class MySQLDeduplicator:
|
|
584
630
|
logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
|
585
631
|
if temp_table:
|
586
632
|
try:
|
587
|
-
with self._get_connection() as conn:
|
633
|
+
with self._conn_ctx() as conn:
|
588
634
|
with conn.cursor() as cursor:
|
589
635
|
drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
|
590
636
|
cursor.execute(drop_temp_sql)
|
@@ -628,13 +674,14 @@ class MySQLDeduplicator:
|
|
628
674
|
logger.info('单表开始', {
|
629
675
|
"库": database,
|
630
676
|
"表": table,
|
631
|
-
"参数": {
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
677
|
+
# "参数": {
|
678
|
+
# "指定去重列": columns,
|
679
|
+
# "去重方式": "Python" if use_python_dedup else "SQL",
|
680
|
+
# "数据处理": self.duplicate_keep_mode,
|
681
|
+
# "模拟运行": dry_run,
|
682
|
+
# '排除列': self.exclude_columns,
|
683
|
+
# },
|
684
|
+
})
|
638
685
|
all_columns = self._get_table_columns(database, table)
|
639
686
|
all_columns_lower = [col.lower() for col in all_columns]
|
640
687
|
time_col = self.date_column
|
@@ -680,7 +727,7 @@ class MySQLDeduplicator:
|
|
680
727
|
logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
|
681
728
|
total_dup += dup_count
|
682
729
|
total_del += affected_rows
|
683
|
-
logger.
|
730
|
+
logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
|
684
731
|
# 自动重排id列(仅当有实际删除时且reorder_id为True)
|
685
732
|
if reorder_id and total_del > 0:
|
686
733
|
try:
|
@@ -688,10 +735,12 @@ class MySQLDeduplicator:
|
|
688
735
|
logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
|
689
736
|
except Exception as e:
|
690
737
|
logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
|
738
|
+
if affected_rows > 0:
|
739
|
+
logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
|
691
740
|
return (total_dup, total_del)
|
692
741
|
# 没有date_column,直接全表去重
|
693
742
|
result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
|
694
|
-
logger.
|
743
|
+
logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
|
695
744
|
dup_count, affected_rows = result
|
696
745
|
if reorder_id and affected_rows > 0:
|
697
746
|
try:
|
@@ -699,6 +748,8 @@ class MySQLDeduplicator:
|
|
699
748
|
logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
|
700
749
|
except Exception as e:
|
701
750
|
logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
|
751
|
+
if affected_rows > 0:
|
752
|
+
logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
|
702
753
|
return result
|
703
754
|
except Exception as e:
|
704
755
|
logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
|
@@ -770,7 +821,11 @@ class MySQLDeduplicator:
|
|
770
821
|
results[table] = (dup_count, affected_rows)
|
771
822
|
total_dup = sum(r[0] for r in results.values())
|
772
823
|
total_del = sum(r[1] for r in results.values())
|
773
|
-
logger.
|
824
|
+
logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
|
825
|
+
# 只显示有删除的详细结果
|
826
|
+
if total_del > 0:
|
827
|
+
filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
|
828
|
+
logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
|
774
829
|
return results
|
775
830
|
except Exception as e:
|
776
831
|
logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
|
@@ -819,7 +874,8 @@ class MySQLDeduplicator:
|
|
819
874
|
'use_python_dedup': use_python_dedup
|
820
875
|
},
|
821
876
|
})
|
822
|
-
|
877
|
+
# 如果parallel=True且库数量大于1,则只在外层并发,内层串行
|
878
|
+
if parallel and self.max_workers > 1 and len(target_dbs) > 1:
|
823
879
|
with concurrent.futures.ThreadPoolExecutor(
|
824
880
|
max_workers=self.max_workers
|
825
881
|
) as executor:
|
@@ -827,6 +883,7 @@ class MySQLDeduplicator:
|
|
827
883
|
for db in target_dbs:
|
828
884
|
tables = tables_map.get(db) if tables_map else None
|
829
885
|
db_columns_map = columns_map.get(db) if columns_map else None
|
886
|
+
# 内层强制串行
|
830
887
|
futures[executor.submit(
|
831
888
|
self.deduplicate_database,
|
832
889
|
db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
|
@@ -855,7 +912,7 @@ class MySQLDeduplicator:
|
|
855
912
|
r[1] for db in all_results.values()
|
856
913
|
for r in db.values()
|
857
914
|
)
|
858
|
-
logger.
|
915
|
+
logger.debug('全局完成', {
|
859
916
|
"总重复组": total_dup,
|
860
917
|
"总删除行": total_del,
|
861
918
|
"参数": {
|
@@ -867,12 +924,30 @@ class MySQLDeduplicator:
|
|
867
924
|
},
|
868
925
|
"详细结果": dict(all_results)
|
869
926
|
})
|
927
|
+
# 只显示有删除的详细结果
|
928
|
+
if total_del > 0:
|
929
|
+
filtered_results = {
|
930
|
+
db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
|
931
|
+
for db, tbls in all_results.items()
|
932
|
+
}
|
933
|
+
filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
|
934
|
+
logger.info('全局完成(仅显示有删除的结果)', {
|
935
|
+
"总重复组": total_dup,
|
936
|
+
"总删除行": total_del,
|
937
|
+
"参数": {
|
938
|
+
"模拟运行": dry_run,
|
939
|
+
"并行处理": parallel,
|
940
|
+
'排除列': self.exclude_columns,
|
941
|
+
'重排id': reorder_id,
|
942
|
+
'use_python_dedup': use_python_dedup
|
943
|
+
},
|
944
|
+
"详细结果": filtered_results
|
945
|
+
})
|
870
946
|
return all_results
|
871
947
|
except Exception as e:
|
872
948
|
logger.error('异常', {"error": str(e), 'traceback': repr(e)})
|
873
949
|
return all_results
|
874
950
|
|
875
|
-
@_retry_on_failure
|
876
951
|
def _check_database_exists(self, database: str) -> bool:
|
877
952
|
"""
|
878
953
|
检查数据库是否存在。
|
@@ -883,13 +958,11 @@ class MySQLDeduplicator:
|
|
883
958
|
bool: 数据库是否存在。
|
884
959
|
"""
|
885
960
|
sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
|
886
|
-
|
887
|
-
with self._get_connection() as conn:
|
961
|
+
with self._conn_ctx() as conn:
|
888
962
|
with conn.cursor() as cursor:
|
889
963
|
cursor.execute(sql, (database,))
|
890
964
|
return bool(cursor.fetchone())
|
891
965
|
|
892
|
-
@_retry_on_failure
|
893
966
|
def _check_table_exists(self, database: str, table: str) -> bool:
|
894
967
|
"""
|
895
968
|
检查表是否存在。
|
@@ -905,13 +978,11 @@ class MySQLDeduplicator:
|
|
905
978
|
FROM INFORMATION_SCHEMA.TABLES
|
906
979
|
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
|
907
980
|
"""
|
908
|
-
|
909
|
-
with self._get_connection() as conn:
|
981
|
+
with self._conn_ctx() as conn:
|
910
982
|
with conn.cursor() as cursor:
|
911
983
|
cursor.execute(sql, (database, table))
|
912
984
|
return bool(cursor.fetchone())
|
913
985
|
|
914
|
-
@_retry_on_failure
|
915
986
|
def _get_table_info(self, database: str, table: str, id_column: str = None):
|
916
987
|
"""
|
917
988
|
获取表的所有列名、主键列名列表、指定id列是否为主键。
|
@@ -923,7 +994,7 @@ class MySQLDeduplicator:
|
|
923
994
|
Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
|
924
995
|
"""
|
925
996
|
id_column = id_column or self.primary_key
|
926
|
-
with self._get_connection() as conn:
|
997
|
+
with self._conn_ctx() as conn:
|
927
998
|
with conn.cursor() as cursor:
|
928
999
|
cursor.execute("""
|
929
1000
|
SELECT COLUMN_NAME, COLUMN_KEY
|
@@ -1032,7 +1103,7 @@ class MySQLDeduplicator:
|
|
1032
1103
|
logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
|
1033
1104
|
return False
|
1034
1105
|
# 检查外键约束
|
1035
|
-
with self._get_connection() as conn:
|
1106
|
+
with self._conn_ctx() as conn:
|
1036
1107
|
with conn.cursor() as cursor:
|
1037
1108
|
cursor.execute("""
|
1038
1109
|
SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
|
@@ -1042,7 +1113,7 @@ class MySQLDeduplicator:
|
|
1042
1113
|
logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
|
1043
1114
|
return False
|
1044
1115
|
# 获取表结构
|
1045
|
-
with self._get_connection() as conn:
|
1116
|
+
with self._conn_ctx() as conn:
|
1046
1117
|
with conn.cursor() as cursor:
|
1047
1118
|
cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
|
1048
1119
|
create_table_sql = cursor.fetchone()['Create Table']
|
@@ -1055,7 +1126,7 @@ class MySQLDeduplicator:
|
|
1055
1126
|
backup_table = self._make_backup_table_name(table)
|
1056
1127
|
backup_table_quoted = f"`{database}`.`{backup_table}`"
|
1057
1128
|
try:
|
1058
|
-
with self._get_connection() as conn:
|
1129
|
+
with self._conn_ctx() as conn:
|
1059
1130
|
with conn.cursor() as cursor:
|
1060
1131
|
# 1. 创建临时表,结构同原表
|
1061
1132
|
try:
|
@@ -1116,7 +1187,7 @@ class MySQLDeduplicator:
|
|
1116
1187
|
logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
|
1117
1188
|
return False
|
1118
1189
|
logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
|
1119
|
-
# 5.
|
1190
|
+
# 5. 自动删除备份表
|
1120
1191
|
if auto_drop_backup:
|
1121
1192
|
try:
|
1122
1193
|
cursor.execute(f"DROP TABLE {backup_table_quoted}")
|
@@ -1127,7 +1198,7 @@ class MySQLDeduplicator:
|
|
1127
1198
|
except Exception as e:
|
1128
1199
|
logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
|
1129
1200
|
# 回滚:如临时表存在则删掉,恢复原表结构
|
1130
|
-
with self._get_connection() as conn:
|
1201
|
+
with self._conn_ctx() as conn:
|
1131
1202
|
with conn.cursor() as cursor:
|
1132
1203
|
try:
|
1133
1204
|
cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
|
@@ -1135,7 +1206,7 @@ class MySQLDeduplicator:
|
|
1135
1206
|
logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
|
1136
1207
|
# 恢复原表(如备份表存在)
|
1137
1208
|
try:
|
1138
|
-
with self._get_connection() as conn2:
|
1209
|
+
with self._conn_ctx() as conn2:
|
1139
1210
|
with conn2.cursor() as cursor2:
|
1140
1211
|
if self._check_table_exists(database, backup_table):
|
1141
1212
|
cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
|
@@ -1227,23 +1298,16 @@ def main():
|
|
1227
1298
|
batch_size=1000,
|
1228
1299
|
skip_system_dbs=True,
|
1229
1300
|
max_retries=3,
|
1230
|
-
|
1231
|
-
pool_size=
|
1301
|
+
retry_waiting_time=5,
|
1302
|
+
# pool_size=30,
|
1232
1303
|
recent_month=1,
|
1233
1304
|
# date_range=['2025-06-09', '2025-06-10'],
|
1234
1305
|
date_column='日期',
|
1235
|
-
exclude_columns=None,
|
1236
1306
|
exclude_databases=['测试库4'],
|
1237
1307
|
exclude_tables={
|
1238
1308
|
'推广数据2': [
|
1239
1309
|
'地域报表_城市_2025_04',
|
1240
|
-
'地域报表_城市_2025_05',
|
1241
|
-
'地域报表_城市_2025_06',
|
1242
1310
|
# '地域报表_城市_2025_04_copy1',
|
1243
|
-
# '地域报表_城市_2025_05_copy1',
|
1244
|
-
# '地域报表_城市_2025_06_copy1',
|
1245
|
-
'奥莱店_主体报表',
|
1246
|
-
# '奥莱店_主体报表_copy1',
|
1247
1311
|
],
|
1248
1312
|
"生意参谋3": [
|
1249
1313
|
"商品排行_2025",
|
@@ -1255,10 +1319,10 @@ def main():
|
|
1255
1319
|
deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
|
1256
1320
|
|
1257
1321
|
# # 指定数据库去重(多线程)
|
1258
|
-
# deduplicator.deduplicate_database('
|
1322
|
+
# deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)
|
1259
1323
|
|
1260
1324
|
# # 指定表去重(使用特定列)
|
1261
|
-
# deduplicator.deduplicate_table('
|
1325
|
+
# deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)
|
1262
1326
|
|
1263
1327
|
# # 重排id列
|
1264
1328
|
# deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=W8WVhYkHLU0SBDlL9Q6XQVTqIrzYjc1kFBZgqzS_NEI,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
8
8
|
mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
|
12
12
|
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
14
|
mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
|
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
26
|
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
-
mdbq-3.
|
28
|
-
mdbq-3.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
-
mdbq-3.11.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
-
mdbq-3.11.11.dist-info/RECORD,,
|
27
|
+
mdbq-3.12.0.dist-info/METADATA,sha256=Q6EyaC61H4okFva6YFV2a0Y3Iqun8L8mnpSkeVXcFdc,364
|
28
|
+
mdbq-3.12.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.12.0.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.12.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|