mdbq 3.11.9__py3-none-any.whl → 3.11.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +452 -232
- {mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/METADATA +1 -1
- {mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/RECORD +6 -6
- {mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/WHEEL +0 -0
- {mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.11.9'
+VERSION = '3.11.11'
mdbq/mysql/deduplicator.py
CHANGED
@@ -34,32 +34,6 @@ logger = mylogger.MyLogger(
 class MySQLDeduplicator:
     """
     MySQL数据去重
-
-    功能:
-    1. 自动检测并删除MySQL数据库中的重复数据
-    2. 支持全库扫描或指定表处理
-    3. 支持多线程/多进程安全处理
-    4. 完善的错误处理和日志记录
-
-    使用示例:
-    deduplicator = MySQLDeduplicator(
-        username='root',
-        password='password',
-        host='localhost',
-        port=3306
-    )
-
-    # 全库去重
-    deduplicator.deduplicate_all()
-
-    # 指定数据库去重(多线程)
-    deduplicator.deduplicate_database('my_db', parallel=True)
-
-    # 指定表去重(使用特定列)
-    deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
-
-    # 关闭连接
-    deduplicator.close()
     """
 
     def __init__(
@@ -69,7 +43,7 @@ class MySQLDeduplicator:
         host: str = 'localhost',
         port: int = 3306,
         charset: str = 'utf8mb4',
-        max_workers: int =
+        max_workers: int = 2,
         batch_size: int = 1000,
         skip_system_dbs: bool = True,
         max_retries: int = 3,
@@ -121,27 +95,17 @@ class MySQLDeduplicator:
         self.retry_interval = retry_interval
         self.primary_key = primary_key
 
-        #
-        self.date_range = date_range
-        self.recent_month = recent_month
+        # 时间范围参数(只保留解析后的结果,去除冗余原始参数)
         self.date_column = date_column
         self._dedup_start_date = None
         self._dedup_end_date = None
-
-        default_exclude = {'id'}
-        # exclude_columns 不传则排除: ['id', '更新时间']
-        if not exclude_columns:
-            self.exclude_columns = list(default_exclude | {'更新时间'})
-        else:
-            self.exclude_columns = list(set(exclude_columns) | default_exclude)
-        # 解析时间范围并智能校正date_range
-        if self.date_range and len(self.date_range) == 2:
+        if date_range and len(date_range) == 2:
             try:
-                start, end = self.date_range
+                start, end = date_range
                 start_dt = datetime.strptime(start, "%Y-%m-%d")
                 end_dt = datetime.strptime(end, "%Y-%m-%d")
                 if start_dt > end_dt:
-                    logger.
+                    logger.debug(
                         "date_range顺序不正确,自动交换开始和结束日期。",
                         {"start": start, "end": end}
                     )
@@ -151,13 +115,13 @@ class MySQLDeduplicator:
             except Exception as e:
                 logger.error(
                     "date_range参数格式错误,应为['YYYY-MM-DD', 'YYYY-MM-DD'],已忽略时间范围。",
-                    {"date_range": self.date_range, "error": str(e)}
+                    {"date_range": date_range, "error": str(e)}
                 )
                 self._dedup_start_date = None
                 self._dedup_end_date = None
-        elif self.recent_month:
+        elif recent_month:
             today = datetime.today()
-            month = today.month - self.recent_month
+            month = today.month - recent_month
             year = today.year
             while month <= 0:
                 month += 12
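The recent_month branch above derives the dedup window by stepping the month counter back and clamping the start to the first of the resulting month. A standalone sketch of that arithmetic (the year decrement inside the loop is implied by the surrounding code rather than shown in this hunk; the function name is illustrative):

from datetime import datetime

def recent_month_window(recent_month, today):
    # Mirror the __init__ logic: walk back `recent_month` months,
    # normalize month <= 0 by borrowing years, clamp start to day 1.
    month = today.month - recent_month
    year = today.year
    while month <= 0:
        month += 12
        year -= 1  # assumed: this line falls between the hunks shown here
    return f"{year}-{month:02d}-01", today.strftime("%Y-%m-%d")

print(recent_month_window(1, datetime(2025, 6, 10)))  # ('2025-05-01', '2025-06-10')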
@@ -165,16 +129,19 @@ class MySQLDeduplicator:
         self._dedup_start_date = f"{year}-{month:02d}-01"
         self._dedup_end_date = today.strftime("%Y-%m-%d")
 
+        # 排除列处理,直接合并去重
+        self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
+
         # 线程安全控制
         self._lock = threading.Lock()
         self._processing_tables = set()  # 正在处理的表集合
 
         # 系统数据库列表
-        self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
+        self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys', 'sakila'}
 
         # 排除数据库和表的逻辑
-        self.exclude_databases = set(
-        self.exclude_tables = {k.lower(): set(
+        self.exclude_databases = set(db.lower() for db in (exclude_databases or []))
+        self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}
 
         self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
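Everything in the exclusion sets is lower-cased once here so later membership checks are case-insensitive. A minimal sketch of the same normalization and lookup (the helper name is illustrative, not part of the package):

def is_excluded(exclude_tables, database, table):
    # Lower-case both the database keys and the table names, as __init__ does,
    # then compare against lower-cased inputs, as deduplicate_table does.
    normalized = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}
    return table.lower() in normalized.get(database.lower(), set())

print(is_excluded({'My_DB': ['My_Table']}, 'my_db', 'MY_TABLE'))  # True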
@@ -215,7 +182,7 @@ class MySQLDeduplicator:
         last_exception = None
         for attempt in range(self.max_retries + 1):
             try:
-                logger.debug(f'调用{func.__name__},第{attempt+1}
+                logger.debug(f'调用{func.__name__},第{attempt+1}次连接', {'args': args, 'kwargs': kwargs})
                 return func(self, *args, **kwargs)
             except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                 last_exception = e
@@ -239,7 +206,7 @@ class MySQLDeduplicator:
     @_retry_on_failure
     def _get_databases(self) -> List[str]:
         """
-        获取所有非系统数据库列表,排除exclude_databases。
+        获取所有非系统数据库列表,排除 exclude_databases。
 
         Returns:
             List[str]: 数据库名列表。
@@ -256,7 +223,7 @@ class MySQLDeduplicator:
     @_retry_on_failure
     def _get_tables(self, database: str) -> List[str]:
         """
-
+        获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。
 
         Args:
             database (str): 数据库名。
@@ -296,43 +263,11 @@ class MySQLDeduplicator:
                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
                         if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
 
-    def _acquire_table_lock(self, database: str, table: str) -> bool:
-        """
-        获取表处理锁,防止并发处理同一张表。
-
-        Args:
-            database (str): 数据库名。
-            table (str): 表名。
-        Returns:
-            bool: 是否成功获取锁。
-        """
-        key = f"{database}.{table}"
-
-        with self._lock:
-            if key in self._processing_tables:
-                logger.debug(f"表 {key} 正在被其他线程处理,跳过")
-                return False
-            self._processing_tables.add(key)
-            return True
-
-    def _release_table_lock(self, database: str, table: str) -> None:
-        """
-        释放表处理锁。
-
-        Args:
-            database (str): 数据库名。
-            table (str): 表名。
-        """
-        key = f"{database}.{table}"
-
-        with self._lock:
-            if key in self._processing_tables:
-                self._processing_tables.remove(key)
-
     @_retry_on_failure
     def _ensure_index(self, database: str, table: str, date_column: str) -> None:
         """
-        检查并为date_column自动创建索引(如果未存在)。
+        检查并为 date_column 自动创建索引(如果未存在)。
+
         Args:
             database (str): 数据库名。
             table (str): 表名。
@@ -356,121 +291,241 @@ class MySQLDeduplicator:
                 try:
                     cursor.execute(f"CREATE INDEX `{safe_index_name}` ON `{database}`.`{table}` (`{date_column}`)")
                     conn.commit()
-                    logger.
+                    logger.debug('已自动为date_column创建索引', {"库": database, "表": table, "date_column": date_column, "索引名": safe_index_name})
                 except Exception as e:
                     logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
-
-
+
+    @_retry_on_failure
+    def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
+        """
+        获取表中所有不同的日期分区(按天)。
+
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            date_column (str): 日期列名。
+        Returns:
+            List[str]: 所有不同的日期(字符串)。
+        """
+        sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute(sql)
+                return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
 
     def _deduplicate_table(
-
-
-
-
-
-
+        self,
+        database: str,
+        table: str,
+        columns: Optional[List[str]] = None,
+        dry_run: bool = False,
+        use_python_dedup: bool = False,
+        date_val: Optional[str] = None,
+        lock_table: bool = True
     ) -> Tuple[int, int]:
         """
-
-
-
-
+        执行单表单天去重。只处理 date_val 这一天的数据(如果有 date_column),否则全表。
+
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            columns (Optional[List[str]]): 指定去重列。
+            dry_run (bool): 是否为模拟运行。
+            use_python_dedup (bool): 是否用 Python 方式去重。
+            date_val (Optional[str]): 指定处理的日期(如有 date_column)。
+            lock_table (bool): 是否加表级锁。
+        Returns:
+            Tuple[int, int]: (重复组数, 实际删除行数)
         """
-        if not self._acquire_table_lock(database, table):
+        if lock_table and not self._acquire_table_lock(database, table):
             return (0, 0)
         temp_table = None
         try:
-            # 获取实际列名
             all_columns = self._get_table_columns(database, table)
             all_columns_lower = [col.lower() for col in all_columns]
             exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
             time_col = self.date_column
            time_col_lower = time_col.lower() if time_col else None
-            # 1. 跳过date_column在exclude_columns的情况
             if time_col_lower and time_col_lower in exclude_columns_lower:
                 logger.warning('date_column在exclude_columns中,跳过该表', {"库": database, "表": table, "date_column": time_col, "exclude_columns": self.exclude_columns})
                 return (0, 0)
-            # 2. 判断表是否包含date_column
             has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
-
-
+
+            # 只要有date_column,始终分天处理(本函数只处理一天)
+            if has_time_col and date_val is not None:
                 self._ensure_index(database, table, time_col)
-
+                # 获取去重列
+                use_columns = columns or all_columns
+                use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
+                invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
+                if invalid_columns:
+                    logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
+                if not use_columns:
+                    logger.error('没有有效的去重列', {"库": database, "表": table, "func": sys._getframe().f_code.co_name})
+                    return (0, 0)
+                pk = self.primary_key
+                pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
+                where_sql = f"t.`{time_col}` = '{date_val}'"
+                # 获取原始数据总量(只统计当天数据)
+                with self._get_connection() as conn:
+                    with conn.cursor() as cursor:
+                        count_where = f"WHERE `{time_col}` = '{date_val}'"
+                        count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                        logger.debug('执行SQL', {'sql': count_sql})
+                        cursor.execute(count_sql)
+                        total_count_row = cursor.fetchone()
+                        total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+                logger.debug('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name, "数据日期": date_val})
+                column_list = ', '.join([f'`{col}`' for col in use_columns])
+
+                # 用Python查找重复
+                if use_python_dedup:
+                    select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
+                    select_where = f"WHERE `{time_col}` = '{date_val}'"
+                    grouped = defaultdict(list)
+                    for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
+                        key = tuple(row[col] for col in use_columns)
+                        grouped[key].append(row[pk_real])
+                    dup_count = 0
+                    del_ids = []
+                    for ids in grouped.values():
+                        if len(ids) > 1:
+                            dup_count += 1
+                            del_ids.extend(ids[1:])
+                    affected_rows = 0
+                    if not dry_run and del_ids:
+                        with self._get_connection() as conn:
+                            with conn.cursor() as cursor:
+                                for i in range(0, len(del_ids), self.batch_size):
+                                    batch_ids = del_ids[i:i+self.batch_size]
+                                    del_ids_str = ','.join([str(i) for i in batch_ids])
+                                    delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                    cursor.execute(delete_sql)
+                                    batch_deleted = cursor.rowcount
+                                    affected_rows += batch_deleted
+                                    conn.commit()
+                    logger.debug('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode, "数据日期": date_val})
+                    return (dup_count, affected_rows)
+                # SQL方式查找重复
+                temp_table = self._make_temp_table_name(table)
+                drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+                create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
+                create_temp_sql = f"""
+                    CREATE TABLE `{database}`.`{temp_table}` AS
+                    SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+                    FROM `{database}`.`{table}`
+                    {create_temp_where}
+                    GROUP BY {column_list}
+                    HAVING COUNT(*) > 1
+                """
+                with self._get_connection() as conn:
+                    with conn.cursor() as cursor:
+                        logger.debug('创建临时表SQL', {'sql': create_temp_sql})
+                        cursor.execute(create_temp_sql)
+                        cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
+                        dup_count_row = cursor.fetchone()
+                        dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
+                        if dup_count == 0:
+                            logger.debug('没有重复数据', {"库": database, "表": table, "数据量": total_count, "数据日期": date_val})
+                            cursor.execute(drop_temp_sql)
+                            conn.commit()
+                            return (0, 0)
+                        affected_rows = 0
+                        if not dry_run:
+                            while True:
+                                where_clauses = []
+                                if self.duplicate_keep_mode == 'keep_one':
+                                    where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
+                                if where_sql.strip():
+                                    where_clauses.append(where_sql.strip())
+                                where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+                                find_dup_ids_sql = f"""
+                                    SELECT t.`{pk_real}` as del_id
+                                    FROM `{database}`.`{table}` t
+                                    JOIN `{database}`.`{temp_table}` tmp
+                                    ON {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+                                    {where_full}
+                                    LIMIT {self.batch_size}
+                                """
+                                logger.debug('查找待删除重复id SQL', {'sql': find_dup_ids_sql})
+                                cursor.execute(find_dup_ids_sql)
+                                del_ids = [row['del_id'] for row in cursor.fetchall()]
+                                if not del_ids:
+                                    break
+                                del_ids_str = ','.join([str(i) for i in del_ids])
+                                delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                logger.debug('按id批量删除SQL', {'sql': delete_sql, 'ids': del_ids})
+                                cursor.execute(delete_sql)
+                                batch_deleted = cursor.rowcount
+                                affected_rows += batch_deleted
+                                conn.commit()
+                                if batch_deleted == 0:
+                                    logger.warning('检测到未能删除任何数据,强制跳出循环,防止假死', {"库": database, "表": table})
+                                    break
+                                if batch_deleted < self.batch_size:
+                                    break
+                            logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode, "数据日期": date_val})
+                        else:
+                            logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
+                            affected_rows = 0
+                        cursor.execute(drop_temp_sql)
+                        conn.commit()
+                        return (dup_count, affected_rows)
+            # 没有date_column,处理全表
+            # ...existing code for full-table deduplication (as before, but without recursion)...
             use_columns = columns or all_columns
             use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
             invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
             if invalid_columns:
                 logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
             if not use_columns:
-                logger.error('没有有效的去重列', {"库": database, "表": table})
+                logger.error('没有有效的去重列', {"库": database, "表": table, "func": sys._getframe().f_code.co_name})
                 return (0, 0)
             pk = self.primary_key
             pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
-            #
-            where_sql = ''
-            if has_time_col and self._dedup_start_date and self._dedup_end_date:
-                where_sql = f"t.`{time_col}` >= '{self._dedup_start_date}' AND t.`{time_col}` <= '{self._dedup_end_date}'"
-            # 获取原始数据总量(只统计区间内数据)
+            # 获取原始数据总量
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
-
-                    count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                    count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
                     logger.debug('执行SQL', {'sql': count_sql})
                     cursor.execute(count_sql)
                     total_count_row = cursor.fetchone()
                     total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
-            logger.
+            logger.debug('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
             column_list = ', '.join([f'`{col}`' for col in use_columns])
-
-            # 用Python查找重复
             if use_python_dedup:
-                from collections import defaultdict
-                # 1. 拉取所有数据
                 select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
-                select_where =
-                select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
-                logger.debug('用Python查找重复,拉取数据SQL', {'sql': select_sql})
-                with self._get_connection() as conn:
-                    with conn.cursor() as cursor:
-                        cursor.execute(select_sql)
-                        rows = cursor.fetchall()
-                # 2. 分组找重复
+                select_where = ''
                 grouped = defaultdict(list)
-                for row in
+                for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
                     key = tuple(row[col] for col in use_columns)
                     grouped[key].append(row[pk_real])
-                # 3. 统计重复组和待删除id
                 dup_count = 0
                 del_ids = []
                 for ids in grouped.values():
                     if len(ids) > 1:
                         dup_count += 1
-                            del_ids.extend(ids[1:])
+                        del_ids.extend(ids[1:])
                 affected_rows = 0
                 if not dry_run and del_ids:
                     with self._get_connection() as conn:
                         with conn.cursor() as cursor:
                             for i in range(0, len(del_ids), self.batch_size):
-
-                                del_ids_str = ','.join([str(i) for i in
+                                batch_ids = del_ids[i:i+self.batch_size]
+                                del_ids_str = ','.join([str(i) for i in batch_ids])
                                 delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
-                                logger.debug('用Python分批删除SQL', {'sql': delete_sql, 'ids': batch})
                                 cursor.execute(delete_sql)
                                 batch_deleted = cursor.rowcount
                                 affected_rows += batch_deleted
                                 conn.commit()
-                logger.
+                logger.debug('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode})
                 return (dup_count, affected_rows)
-
-            temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
+            temp_table = self._make_temp_table_name(table)
             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
-            # 创建临时表时加where条件
-            create_temp_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
             create_temp_sql = f"""
                 CREATE TABLE `{database}`.`{temp_table}` AS
                 SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
                 FROM `{database}`.`{table}`
-                {create_temp_where}
                 GROUP BY {column_list}
                 HAVING COUNT(*) > 1
             """
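Both branches above implement the same rule: group rows by the effective dedup columns, keep one representative per group (the SQL path keeps MIN of the primary key), and delete the rest in batches. A self-contained sketch of the Python-side grouping, runnable without a database (the function and data are illustrative, not package API):

from collections import defaultdict

def find_duplicate_ids(rows, key_cols, pk='id'):
    # Group primary keys by the tuple of dedup-column values, like the
    # grouped/del_ids loop in _deduplicate_table.
    grouped = defaultdict(list)
    for row in rows:
        grouped[tuple(row[c] for c in key_cols)].append(row[pk])
    dup_groups, del_ids = 0, []
    for ids in grouped.values():
        if len(ids) > 1:
            dup_groups += 1
            del_ids.extend(sorted(ids)[1:])  # keep the smallest id, mirroring MIN(id)
    return dup_groups, del_ids

rows = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'a'}, {'id': 3, 'name': 'b'}]
print(find_duplicate_ids(rows, ['name']))  # (1, [2])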
@@ -482,7 +537,7 @@ class MySQLDeduplicator:
                     dup_count_row = cursor.fetchone()
                     dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                     if dup_count == 0:
-                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count
+                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count})
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
@@ -492,8 +547,6 @@ class MySQLDeduplicator:
                         where_clauses = []
                         if self.duplicate_keep_mode == 'keep_one':
                             where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
-                        if where_sql.strip():
-                            where_clauses.append(where_sql.strip())
                         where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
                         find_dup_ids_sql = f"""
                             SELECT t.`{pk_real}` as del_id
@@ -520,16 +573,15 @@ class MySQLDeduplicator:
                                 break
                             if batch_deleted < self.batch_size:
                                 break
-                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "
+                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode})
                     else:
-                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "
+                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
                         affected_rows = 0
                     cursor.execute(drop_temp_sql)
                     conn.commit()
                     return (dup_count, affected_rows)
         except Exception as e:
             logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
-            # 异常时也要清理临时表
             if temp_table:
                 try:
                     with self._get_connection() as conn:
@@ -541,29 +593,30 @@ class MySQLDeduplicator:
                     logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
             return (0, 0)
         finally:
-
+            if lock_table:
+                self._release_table_lock(database, table)
 
     def deduplicate_table(
-
-
-
-
-
-
-
+        self,
+        database: str,
+        table: str,
+        columns: Optional[List[str]] = None,
+        dry_run: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Tuple[int, int]:
         """
-
+        对指定表进行去重。始终按天分区(如有 date_column),否则全表。
 
         Args:
             database (str): 数据库名。
             table (str): 表名。
-            columns (Optional[List[str]]):
-            dry_run (bool):
-            reorder_id (bool):
-            use_python_dedup (bool): 是否用Python
+            columns (Optional[List[str]]): 指定去重列。
+            dry_run (bool): 是否为模拟运行。
+            reorder_id (bool): 去重后是否自动重排 id 列。
+            use_python_dedup (bool): 是否用 Python 方式去重。
         Returns:
-            Tuple[int, int]: (重复组数, 实际删除行数)
+            Tuple[int, int]: (重复组数, 实际删除行数)
         """
         if database.lower() in self.exclude_tables and table.lower() in self.exclude_tables[database.lower()]:
             logger.info('表被排除', {"库": database, "表": table, "操作": "跳过"})
@@ -572,10 +625,73 @@ class MySQLDeduplicator:
         if not self._check_table_exists(database, table):
             logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
             return (0, 0)
-        logger.info('单表开始', {
-
-
-
+        logger.info('单表开始', {
+            "库": database,
+            "表": table,
+            "参数": {
+                "指定去重列": columns,
+                "去重方式": "Python" if use_python_dedup else "SQL",
+                "数据处理": self.duplicate_keep_mode,
+                "模拟运行": dry_run,
+                '排除列': self.exclude_columns,
+            }})
+        all_columns = self._get_table_columns(database, table)
+        all_columns_lower = [col.lower() for col in all_columns]
+        time_col = self.date_column
+        time_col_lower = time_col.lower() if time_col else None
+        has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
+        if has_time_col:
+            self._ensure_index(database, table, time_col)
+            all_dates = self._get_all_dates(database, table, time_col)
+            # 按date_range/recent_month筛选日期
+            start_date = self._dedup_start_date
+            end_date = self._dedup_end_date
+            if start_date and end_date:
+                all_dates = [d for d in all_dates if str(start_date) <= str(d) <= str(end_date)]
+            if not all_dates:
+                logger.info('无可处理日期', {"库": database, "表": table})
+                return (0, 0)
+            total_dup = 0
+            total_del = 0
+            def process_date(date_val):
+                try:
+                    logger.debug('按天分区去重', {"库": database, "表": table, "日期": date_val})
+                    dup_count, affected_rows = self._deduplicate_table(
+                        database, table, columns, dry_run, use_python_dedup,
+                        date_val=date_val, lock_table=False
+                    )
+                    return (dup_count, affected_rows, date_val, None)
+                except Exception as e:
+                    logger.error('分区去重异常', {"库": database, "表": table, "日期": date_val, "异常": str(e), "func": sys._getframe().f_code.co_name})
+                    return (0, 0, date_val, str(e))
+            if self.max_workers > 1:
+                with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                    future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
+                    for future in concurrent.futures.as_completed(future_to_date):
+                        dup_count, affected_rows, date_val, err = future.result()
+                        if err:
+                            logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+                        total_dup += dup_count
+                        total_del += affected_rows
+            else:
+                for date_val in all_dates:
+                    dup_count, affected_rows, _, err = process_date(date_val)
+                    if err:
+                        logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+                    total_dup += dup_count
+                    total_del += affected_rows
+            logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
+            # 自动重排id列(仅当有实际删除时且reorder_id为True)
+            if reorder_id and total_del > 0:
+                try:
+                    reorder_ok = self.reorder_id_column(database, table, id_column=self.primary_key, dry_run=dry_run)
+                    logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
+                except Exception as e:
+                    logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+            return (total_dup, total_del)
+        # 没有date_column,直接全表去重
+        result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
+        logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
         dup_count, affected_rows = result
         if reorder_id and affected_rows > 0:
             try:
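deduplicate_table now fans each distinct date out to _deduplicate_table and sums the per-day results, via a thread pool when max_workers > 1; per-day calls pass lock_table=False so days of the same table do not contend for the table-level lock. A reduced sketch of that fan-out with a stand-in worker (names are illustrative):

import concurrent.futures

def dedup_by_day(dates, process_date, max_workers=2):
    # Aggregate (dup_groups, deleted_rows) over per-day workers, as above.
    total_dup = total_del = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_date, d): d for d in dates}
        for future in concurrent.futures.as_completed(futures):
            dup, deleted = future.result()
            total_dup += dup
            total_del += deleted
    return total_dup, total_del

print(dedup_by_day(['2025-06-09', '2025-06-10'], lambda d: (1, 2)))  # (2, 4)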
@@ -589,28 +705,28 @@ class MySQLDeduplicator:
             return (0, 0)
 
     def deduplicate_database(
-
-
-
-
-
-
-
+        self,
+        database: str,
+        tables: Optional[List[str]] = None,
+        columns_map: Optional[Dict[str, List[str]]] = None,
+        dry_run: bool = False,
+        parallel: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Tuple[int, int]]:
         """
-
+        对指定数据库的所有表进行去重。调用 deduplicate_table,自动适配分天。
 
         Args:
             database (str): 数据库名。
-            tables (Optional[List[str]]):
-            columns_map (Optional[Dict[str, List[str]]]):
+            tables (Optional[List[str]]): 指定表名列表。
+            columns_map (Optional[Dict[str, List[str]]]): 每个表的去重列映射。
             dry_run (bool): 是否为模拟运行。
-            parallel (bool):
-            reorder_id (bool):
-            use_python_dedup (bool): 是否用Python
+            parallel (bool): 是否并行处理表。
+            reorder_id (bool): 去重后是否自动重排 id 列。
+            use_python_dedup (bool): 是否用 Python 方式去重。
         Returns:
-            Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
+            Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
         """
         results = {}
         try:
@@ -626,8 +742,6 @@ class MySQLDeduplicator:
                 return results
             logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
             if parallel and self.max_workers > 1:
-                logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
-                # 使用线程池并行处理
                 with concurrent.futures.ThreadPoolExecutor(
                     max_workers=self.max_workers
                 ) as executor:
@@ -637,7 +751,7 @@ class MySQLDeduplicator:
                         logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
                         futures[executor.submit(
                             self.deduplicate_table,
-                            database, table, columns, dry_run, reorder_id,
+                            database, table, columns, dry_run, reorder_id, use_python_dedup
                         )] = table
                     for future in concurrent.futures.as_completed(futures):
                         table = futures[future]
@@ -648,45 +762,43 @@ class MySQLDeduplicator:
                             logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
                             results[table] = (0, 0)
             else:
-                logger.debug('串行处理表', {'库': database})
-                # 串行处理
                 for table in target_tables:
                     columns = columns_map.get(table) if columns_map else None
                     dup_count, affected_rows = self.deduplicate_table(
-                        database, table, columns, dry_run, reorder_id,
+                        database, table, columns, dry_run, reorder_id, use_python_dedup
                     )
                     results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.info('
+            logger.info('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
             return results
         except Exception as e:
             logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
             return results
 
     def deduplicate_all(
-
-
-
-
-
-
-
-
+        self,
+        databases: Optional[List[str]] = None,
+        tables_map: Optional[Dict[str, List[str]]] = None,
+        columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
+        dry_run: bool = False,
+        parallel: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Dict[str, Tuple[int, int]]]:
         """
-
+        对所有数据库进行去重。调用 deduplicate_database,自动适配分天。
 
         Args:
-            databases (Optional[List[str]]):
-            tables_map (Optional[Dict[str, List[str]]]):
-            columns_map (Optional[Dict[str, Dict[str, List[str]]]]):
-            dry_run (bool):
-            parallel (bool):
-            reorder_id (bool):
-            use_python_dedup (bool): 是否用Python
+            databases (Optional[List[str]]): 指定数据库名列表。
+            tables_map (Optional[Dict[str, List[str]]]): 每个库的表名映射。
+            columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 每个库每个表的去重列映射。
+            dry_run (bool): 是否为模拟运行。
+            parallel (bool): 是否并行处理库。
+            reorder_id (bool): 去重后是否自动重排 id 列。
+            use_python_dedup (bool): 是否用 Python 方式去重。
         Returns:
-            Dict[str, Dict[str, Tuple[int, int]]]:
+            Dict[str, Dict[str, Tuple[int, int]]]: {库: {表: (重复组数, 实际删除行数)}}
         """
         all_results: Dict[str, Dict[str, Tuple[int, int]]] = defaultdict(dict)
         try:
@@ -696,9 +808,18 @@ class MySQLDeduplicator:
             if not target_dbs:
                 logger.warning('没有可处理的数据库')
                 return all_results
-            logger.info('全局开始', {
+            logger.info('全局开始', {
+                "数据库数量": len(target_dbs),
+                "数据库列表": target_dbs,
+                "参数": {
+                    "模拟运行": dry_run,
+                    "并行处理": parallel,
+                    '排除列': self.exclude_columns,
+                    '重排id': reorder_id,
+                    'use_python_dedup': use_python_dedup
+                },
+            })
             if parallel and self.max_workers > 1:
-                # 使用线程池并行处理多个数据库
                 with concurrent.futures.ThreadPoolExecutor(
                     max_workers=self.max_workers
                 ) as executor:
@@ -708,7 +829,7 @@ class MySQLDeduplicator:
                         db_columns_map = columns_map.get(db) if columns_map else None
                         futures[executor.submit(
                             self.deduplicate_database,
-                            db, tables, db_columns_map, dry_run, False, reorder_id,
+                            db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
                         )] = db
                     for future in concurrent.futures.as_completed(futures):
                         db = futures[future]
@@ -719,12 +840,11 @@ class MySQLDeduplicator:
                             logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
                             all_results[db] = {}
             else:
-                # 串行处理数据库
                 for db in target_dbs:
                     tables = tables_map.get(db) if tables_map else None
                     db_columns_map = columns_map.get(db) if columns_map else None
                     db_results = self.deduplicate_database(
-                        db, tables, db_columns_map, dry_run, parallel, reorder_id,
+                        db, tables, db_columns_map, dry_run, parallel, reorder_id, use_python_dedup
                     )
                     all_results[db] = db_results
             total_dup = sum(
@@ -735,7 +855,18 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.info('全局完成', {
+            logger.info('全局完成', {
+                "总重复组": total_dup,
+                "总删除行": total_del,
+                "参数": {
+                    "模拟运行": dry_run,
+                    "并行处理": parallel,
+                    '排除列': self.exclude_columns,
+                    '重排id': reorder_id,
+                    'use_python_dedup': use_python_dedup
+                },
+                "详细结果": dict(all_results)
+            })
             return all_results
         except Exception as e:
             logger.error('异常', {"error": str(e), 'traceback': repr(e)})
@@ -780,6 +911,31 @@ class MySQLDeduplicator:
                 cursor.execute(sql, (database, table))
                 return bool(cursor.fetchone())
 
+    @_retry_on_failure
+    def _get_table_info(self, database: str, table: str, id_column: str = None):
+        """
+        获取表的所有列名、主键列名列表、指定id列是否为主键。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            id_column (str): id列名,默认使用self.primary_key。
+        Returns:
+            Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
+        """
+        id_column = id_column or self.primary_key
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    SELECT COLUMN_NAME, COLUMN_KEY
+                    FROM INFORMATION_SCHEMA.COLUMNS
+                    WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+                """, (database, table))
+                columns_info = cursor.fetchall()
+                columns = [row['COLUMN_NAME'] for row in columns_info]
+                pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+                id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+                return columns, pk_cols, id_is_pk
+
     def close(self) -> None:
         """
         关闭连接池。
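This helper centralizes the INFORMATION_SCHEMA lookup that reorder_id_column previously inlined (see the later hunk). A hedged usage sketch, with `dedup` standing in for a constructed MySQLDeduplicator and 'my_db'/'my_table' as placeholder names:

# Hypothetical call site; return shape follows the docstring above.
columns, pk_cols, id_is_pk = dedup._get_table_info('my_db', 'my_table', 'id')
# reorder_id_column proceeds only for a single-column id primary key:
safe_to_reorder = id_is_pk and len(pk_cols) == 1 and pk_cols[0].lower() == 'id'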
@@ -828,15 +984,16 @@ class MySQLDeduplicator:
         auto_drop_backup: bool = True
     ) -> Any:
         """
-        安全重排指定表或指定库下所有表的id列为顺序自增(1,2,3...)。
+        安全重排指定表或指定库下所有表的 id 列为顺序自增(1,2,3...)。
+
         Args:
-            database (str):
-            table (Optional[str]): 表名,None
-            id_column (str): id列名,默认"id"
-            dry_run (bool):
-            auto_drop_backup (bool):
+            database (str): 数据库名。
+            table (Optional[str]): 表名,None 时批量处理该库所有表。
+            id_column (str): id 列名,默认 "id"。
+            dry_run (bool): 是否为模拟运行。
+            auto_drop_backup (bool): 校验通过后自动删除备份表。
         Returns:
-            bool 或 dict: 单表时bool,批量时{表名: bool}
+            bool 或 dict: 单表时 bool,批量时 {表名: bool}
         """
         if not table:
             # 批量模式,对库下所有表执行
@@ -865,22 +1022,12 @@ class MySQLDeduplicator:
         if not self._check_table_exists(database, table):
             logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
             return False
-        # 检查id
-
-        with conn.cursor() as cursor:
-            cursor.execute("""
-                SELECT COLUMN_NAME, COLUMN_KEY
-                FROM INFORMATION_SCHEMA.COLUMNS
-                WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
-            """, (database, table))
-            columns_info = cursor.fetchall()
-            columns = [row['COLUMN_NAME'] for row in columns_info]
-            id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+        # 检查id列、主键信息(用_get_table_info)
+        columns, pk_cols, id_is_pk = self._get_table_info(database, table, id_column)
         if id_column not in columns:
             logger.warning('表无id列,跳过id重排', {"库": database, "表": table})
             return False
         # 检查主键是否为单列id
-        pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
         if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
             logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
             return False
@@ -903,9 +1050,9 @@ class MySQLDeduplicator:
         if dry_run:
             logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
             return True
-        temp_table = self.
+        temp_table = self._make_temp_table_name(table)
         temp_table_quoted = f"`{database}`.`{temp_table}`"
-        backup_table = self.
+        backup_table = self._make_backup_table_name(table)
         backup_table_quoted = f"`{database}`.`{backup_table}`"
         try:
             with self._get_connection() as conn:
@@ -1000,6 +1147,41 @@ class MySQLDeduplicator:
         finally:
             self._release_table_lock(database, table)
 
+    def _acquire_table_lock(self, database: str, table: str, timeout: int = 60) -> bool:
+        """
+        获取表级锁,防止多线程/多进程并发操作同一张表。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            timeout (int): 等待锁的超时时间(秒)。
+        Returns:
+            bool: 是否成功获取锁。
+        """
+        key = f"{database.lower()}::{table.lower()}"
+        start_time = time.time()
+        while True:
+            with self._lock:
+                if key not in self._processing_tables:
+                    self._processing_tables.add(key)
+                    return True
+            if time.time() - start_time > timeout:
+                logger.warning('获取表级锁超时', {"库": database, "表": table, "timeout": timeout})
+                return False
+            time.sleep(0.2)
+
+    def _release_table_lock(self, database: str, table: str) -> None:
+        """
+        释放表级锁。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+        Returns:
+            None
+        """
+        key = f"{database.lower()}::{table.lower()}"
+        with self._lock:
+            self._processing_tables.discard(key)
+
     @staticmethod
     def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
         """
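Unlike the removed fail-fast lock, this version polls under self._lock until the key frees up or timeout elapses. A standalone sketch of the same behavior outside the class (module-level names are illustrative):

import threading
import time

_lock = threading.Lock()
_processing = set()

def acquire(key, timeout=1.0):
    # Poll-with-timeout, as in _acquire_table_lock; note the sleep happens
    # outside the mutex so waiters never hold it while idle.
    start = time.time()
    while True:
        with _lock:
            if key not in _processing:
                _processing.add(key)
                return True
        if time.time() - start > timeout:
            return False
        time.sleep(0.05)

def release(key):
    with _lock:
        _processing.discard(key)

print(acquire('db::t'))               # True
print(acquire('db::t', timeout=0.2))  # False: already held, times out
release('db::t')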
@@ -1020,6 +1202,20 @@ class MySQLDeduplicator:
             return (prefix + suffix)[:max_length]
         return f"{prefix}{base[:remain]}{suffix}"[:max_length]
 
+    def _make_temp_table_name(self, base: str) -> str:
+        """
+        生成临时表名,带有 temp_ 前缀和 _dedup_ 进程线程后缀。
+        """
+        suffix = f"_dedup_{os.getpid()}_{threading.get_ident()}"
+        return self._make_safe_table_name(base, prefix="temp_", suffix=suffix)
+
+    def _make_backup_table_name(self, base: str) -> str:
+        """
+        生成备份表名,带有 backup_ 前缀和时间戳+uuid后缀。
+        """
+        suffix = f"_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+        return self._make_safe_table_name(base, prefix="backup_", suffix=suffix)
+
 
 def main():
     deduplicator = MySQLDeduplicator(
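Both helpers delegate to _make_safe_table_name (shown in context above), which trims the base name so the prefix and suffix always fit MySQL's 64-character identifier limit. A sketch reproducing that truncation rule:

import os
import threading

def make_safe_table_name(base, prefix='', suffix='', max_length=64):
    # Same rule as _make_safe_table_name: the base is trimmed, never the
    # prefix or suffix, so the process/thread or timestamp tag survives.
    remain = max_length - len(prefix) - len(suffix)
    if remain <= 0:
        return (prefix + suffix)[:max_length]
    return f"{prefix}{base[:remain]}{suffix}"[:max_length]

suffix = f"_dedup_{os.getpid()}_{threading.get_ident()}"
print(make_safe_table_name('x' * 100, prefix='temp_', suffix=suffix))  # always <= 64 chars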
@@ -1027,18 +1223,42 @@ def main():
         password='pwd',
         host='localhost',
         port=3306,
-
-
+        max_workers= 2,
+        batch_size=1000,
+        skip_system_dbs=True,
+        max_retries=3,
+        retry_interval=5,
+        pool_size=5,
+        recent_month=1,
+        # date_range=['2025-06-09', '2025-06-10'],
+        date_column='日期',
+        exclude_columns=None,
+        exclude_databases=['测试库4'],
+        exclude_tables={
+            '推广数据2': [
+                '地域报表_城市_2025_04',
+                '地域报表_城市_2025_05',
+                '地域报表_城市_2025_06',
+                # '地域报表_城市_2025_04_copy1',
+                # '地域报表_城市_2025_05_copy1',
+                # '地域报表_城市_2025_06_copy1',
+                '奥莱店_主体报表',
+                # '奥莱店_主体报表_copy1',
+            ],
+            "生意参谋3": [
+                "商品排行_2025",
+            ],
+        },
     )
 
     # 全库去重(单线程)
     deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
 
     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('
+    # deduplicator.deduplicate_database('推广数据2', dry_run=False, parallel=True, reorder_id=True)
 
     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('
+    # deduplicator.deduplicate_table('推广数据2', '地域报表_城市_2025_06_copy1', columns=[], dry_run=False, reorder_id=True)
 
     # # 重排id列
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
@@ -1047,5 +1267,5 @@ def main():
     deduplicator.close()
 
 if __name__ == '__main__':
-    main()
+    # main()
     pass
{mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=GrY3av2BYeEaosI2qWYizQyTwyijdq8IuOuFjTJqLxE,19
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=e84MLhWjdCoDB8GxUV-z5drn8hdKGlJKnHzNW0rjIM8,65345
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
 mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.9.dist-info/METADATA,sha256=
-mdbq-3.11.9.dist-info/WHEEL,sha256=
-mdbq-3.11.9.dist-info/top_level.txt,sha256=
-mdbq-3.11.9.dist-info/RECORD,,
+mdbq-3.11.11.dist-info/METADATA,sha256=NHTu8tsBwtvh90jaiNN4E4i9SW5xkH6P-yYcBrxwSbU,365
+mdbq-3.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.11.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.11.11.dist-info/RECORD,,
{mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/WHEEL
File without changes

{mdbq-3.11.9.dist-info → mdbq-3.11.11.dist-info}/top_level.txt
File without changes