mdbq 3.9.6__py3-none-any.whl → 3.9.7__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,595 @@
+ # -*- coding:utf-8 -*-
+ import time
+ from functools import wraps
+ import warnings
+ import pymysql
+ from mdbq.log import mylogger
+ from typing import List, Dict, Optional, Tuple
+ from dbutils.pooled_db import PooledDB
+ import threading
+ import concurrent.futures
+ from collections import defaultdict
+
+
+ warnings.filterwarnings('ignore')
+ logger = mylogger.MyLogger(
+     name='deduplicator',
+     logging_mode='none',
+     log_level='error',
+     log_file='deduplicator.log',
+     log_format='json',
+     max_log_size=50,
+     backup_count=5,
+     enable_async=False,  # whether to enable asynchronous logging
+     sample_rate=0.5,  # sample 50% of DEBUG/INFO logs
+     sensitive_fields=[],  # list of sensitive fields to mask
+ )
+
+
+ class MySQLDeduplicator:
+     """
+     MySQL data deduplicator.
+
+     Features:
+     1. Automatically detects and deletes duplicate rows in MySQL databases
+     2. Can scan every database or process specific tables
+     3. Safe for multi-threaded / multi-process use
+     4. Thorough error handling and logging
+
+     Usage example:
+     deduplicator = MySQLDeduplicator(
+         username='root',
+         password='password',
+         host='localhost',
+         port=3306
+     )
+
+     # Deduplicate every database
+     deduplicator.deduplicate_all()
+
+     # Deduplicate one database (multi-threaded)
+     deduplicator.deduplicate_database('my_db', parallel=True)
+
+     # Deduplicate one table using specific columns
+     deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+
+     # Close the connection pool
+     deduplicator.close()
+     """
+
+     def __init__(
+             self,
+             username: str,
+             password: str,
+             host: str = 'localhost',
+             port: int = 3306,
+             charset: str = 'utf8mb4',
+             max_workers: int = 1,
+             batch_size: int = 1000,
+             skip_system_dbs: bool = True,
+             max_retries: int = 3,
+             retry_interval: int = 5,
+             pool_size: int = 5
+     ):
+         """
+         Initialize the deduplicator.
+
+         :param username: database user name
+         :param password: database password
+         :param host: database host, defaults to localhost
+         :param port: database port, defaults to 3306
+         :param charset: character set, defaults to utf8mb4
+         :param max_workers: maximum number of worker threads, defaults to 1 (single-threaded)
+         :param batch_size: batch size, defaults to 1000
+         :param skip_system_dbs: whether to skip system databases, defaults to True
+         :param max_retries: maximum number of retry attempts
+         :param retry_interval: interval between retries, in seconds
+         :param pool_size: connection pool size
+         """
+         # Initialize the connection pool
+         self.pool = PooledDB(
+             creator=pymysql,
+             host=host,
+             port=port,
+             user=username,
+             password=password,
+             charset=charset,
+             maxconnections=pool_size,
+             cursorclass=pymysql.cursors.DictCursor
+         )
+
+         # Configuration
+         self.max_workers = max(1, min(max_workers, 20))  # cap the number of worker threads
+         self.batch_size = batch_size
+         self.skip_system_dbs = skip_system_dbs
+         self.max_retries = max_retries
+         self.retry_interval = retry_interval
+
+         # Thread-safety controls
+         self._lock = threading.Lock()
+         self._processing_tables = set()  # tables currently being processed
+
+         # System databases
+         self.SYSTEM_DATABASES = {
+             'information_schema', 'mysql',
+             'performance_schema', 'sys'
+         }
+
+     def _get_connection(self):
+         """Fetch a connection from the pool."""
+         try:
+             conn = self.pool.connection()
+             logger.debug("Acquired a database connection")
+             return conn
+         except Exception as e:
+             logger.error(f"Failed to acquire a database connection: {str(e)}")
+             raise ConnectionError(f"Failed to connect to the database: {str(e)}")
+
+     @staticmethod
+     def _retry_on_failure(func):
+         """Retry decorator with linear backoff."""
+
+         @wraps(func)
+         def wrapper(self, *args, **kwargs):
+             last_exception = None
+             for attempt in range(self.max_retries + 1):
+                 try:
+                     return func(self, *args, **kwargs)
+                 except (pymysql.OperationalError, pymysql.InterfaceError) as e:
+                     last_exception = e
+                     if attempt < self.max_retries:
+                         wait_time = self.retry_interval * (attempt + 1)
+                         logger.warning(
+                             f"Database operation failed, retrying (attempt {attempt + 1}/{self.max_retries})",
+                             {'error': str(e), 'wait_time': wait_time})
+                         time.sleep(wait_time)
+                         continue
+                 except Exception as e:
+                     last_exception = e
+                     logger.error(f"Operation failed: {str(e)}", {'error_type': type(e).__name__})
+                     break
+
+             if last_exception:
+                 raise last_exception
+             raise Exception("Unknown error")
+
+         return wrapper
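+     # The backoff above is linear: wait_time = retry_interval * (attempt + 1).
+     # With the defaults (max_retries=3, retry_interval=5), a transient
+     # OperationalError is retried after waits of 5s, 10s and 15s, after which
+     # the last exception is re-raised.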
+
+     @_retry_on_failure
+     def _get_databases(self) -> List[str]:
+         """Return every non-system database."""
+         sql = "SHOW DATABASES"
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql)
+                 all_dbs = [row['Database'] for row in cursor.fetchall()]
+
+                 if self.skip_system_dbs:
+                     return [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES]
+                 return all_dbs
+
+     @_retry_on_failure
+     def _get_tables(self, database: str) -> List[str]:
+         """Return every table in the given database."""
+         sql = "SHOW TABLES"
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(f"USE `{database}`")
+                 cursor.execute(sql)
+                 return [row[f'Tables_in_{database}'] for row in cursor.fetchall()]
+
+     @_retry_on_failure
+     def _get_table_columns(self, database: str, table: str) -> List[str]:
+         """Return the table's column names (excluding the id column)."""
+         sql = """
+         SELECT COLUMN_NAME
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+         ORDER BY ORDINAL_POSITION
+         """
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql, (database, table))
+                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
+                         if row['COLUMN_NAME'].lower() != 'id']
+
+     def _acquire_table_lock(self, database: str, table: str) -> bool:
+         """Acquire the per-table lock so the same table is never processed concurrently."""
+         key = f"{database}.{table}"
+
+         with self._lock:
+             if key in self._processing_tables:
+                 logger.debug(f"Table {key} is being processed by another thread, skipping")
+                 return False
+             self._processing_tables.add(key)
+             return True
+
+     def _release_table_lock(self, database: str, table: str):
+         """Release the per-table lock."""
+         key = f"{database}.{table}"
+
+         with self._lock:
+             if key in self._processing_tables:
+                 self._processing_tables.remove(key)
+
+     def _deduplicate_table(
+             self,
+             database: str,
+             table: str,
+             columns: Optional[List[str]] = None,
+             dry_run: bool = False
+     ) -> Tuple[int, int]:
+         """
+         Deduplicate a single table.
+
+         :param database: database name
+         :param table: table name
+         :param columns: columns used for deduplication (all columns when None)
+         :param dry_run: simulate only (count duplicates without deleting)
+         :return: (number of duplicate groups, number of rows deleted)
+         """
+         if not self._acquire_table_lock(database, table):
+             return (0, 0)
+
+         try:
+             logger.info(f"Processing table: {database}.{table}")
+
+             # Fetch the actual column names
+             all_columns = self._get_table_columns(database, table)
+             if not all_columns:
+                 logger.warning(f"Table {database}.{table} has no usable columns (it may only have an id column), skipping")
+                 return (0, 0)
+
+             # Use the requested columns, or every column
+             use_columns = columns or all_columns
+             invalid_columns = set(use_columns) - set(all_columns)
+
+             if invalid_columns:
+                 logger.warning(
+                     f"Table {database}.{table} does not contain these columns: {invalid_columns}; using the valid ones",
+                     {'invalid_columns': invalid_columns}
+                 )
+                 use_columns = [col for col in use_columns if col in all_columns]
+
+             if not use_columns:
+                 logger.error(f"Table {database}.{table} has no valid deduplication columns")
+                 return (0, 0)
+
+             # Build the deduplication SQL
+             column_list = ', '.join([f'`{col}`' for col in use_columns])
+             temp_table = f"temp_{table}_{int(time.time())}"
+
+             # Stage the duplicate groups in a helper table to avoid long locks on the source table
+             create_temp_sql = f"""
+             CREATE TABLE `{database}`.`{temp_table}` AS
+             SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+             FROM `{database}`.`{table}`
+             GROUP BY {column_list}
+             HAVING COUNT(*) > 1
+             """
+
+             # Delete every row of a duplicate group except the one with the smallest id,
+             # joining on the dedup columns so unique rows are never touched. The
+             # NULL-safe `<=>` comparison keeps groups containing NULL values together.
+             join_condition = ' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])
+             delete_dup_sql = f"""
+             DELETE t FROM `{database}`.`{table}` t
+             JOIN `{database}`.`{temp_table}` tmp ON {join_condition}
+             WHERE t.`id` > tmp.`min_id`
+             """
+
+             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
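+             # For illustration: deduplicating a hypothetical table `shop`.`orders`
+             # on columns ['name', 'date'] (made-up names) produces statements of
+             # this shape:
+             #
+             #   CREATE TABLE `shop`.`temp_orders_1700000000` AS
+             #   SELECT MIN(`id`) as `min_id`, `name`, `date`, COUNT(*) as `dup_count`
+             #   FROM `shop`.`orders`
+             #   GROUP BY `name`, `date`
+             #   HAVING COUNT(*) > 1
+             #
+             #   DELETE t FROM `shop`.`orders` t
+             #   JOIN `shop`.`temp_orders_1700000000` tmp
+             #       ON t.`name` <=> tmp.`name` AND t.`date` <=> tmp.`date`
+             #   WHERE t.`id` > tmp.`min_id`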
+
+             with self._get_connection() as conn:
+                 with conn.cursor() as cursor:
+                     # Build the helper table that tallies the duplicate groups
+                     cursor.execute(create_temp_sql)
+                     cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
+                     dup_count = cursor.fetchone()['cnt']
+
+                     if dup_count == 0:
+                         logger.info(f"Table {database}.{table} has no duplicate rows")
+                         cursor.execute(drop_temp_sql)
+                         conn.commit()
+                         return (0, 0)
+
+                     logger.info(
+                         f"Table {database}.{table} contains {dup_count} groups of duplicate rows",
+                         {'columns': use_columns}
+                     )
+
+                     if not dry_run:
+                         # Perform the actual deletion
+                         cursor.execute(delete_dup_sql)
+                         affected_rows = cursor.rowcount
+                         conn.commit()
+                         logger.info(
+                             f"Deleted {affected_rows} duplicate rows from table {database}.{table}",
+                             {'columns': use_columns}
+                         )
+                     else:
+                         affected_rows = 0
+                         logger.info(
+                             f"[dry run] Table {database}.{table} would delete rows from {dup_count} duplicate groups",
+                             {'columns': use_columns}
+                         )
+
+                     # Drop the helper table
+                     cursor.execute(drop_temp_sql)
+                     conn.commit()
+
+             return (dup_count, affected_rows)
+
+         except Exception as e:
+             logger.error(
+                 f"Error while processing table {database}.{table}: {str(e)}",
+                 {'error_type': type(e).__name__}
+             )
+             return (0, 0)
+         finally:
+             self._release_table_lock(database, table)
+
+     def deduplicate_table(
+             self,
+             database: str,
+             table: str,
+             columns: Optional[List[str]] = None,
+             dry_run: bool = False
+     ) -> Tuple[int, int]:
+         """
+         Deduplicate the given table.
+
+         :param database: database name
+         :param table: table name
+         :param columns: columns used for deduplication (all columns when None)
+         :param dry_run: simulate only (count duplicates without deleting)
+         :return: (number of duplicate groups, number of rows deleted)
+         """
+         try:
+             # Make sure the table exists
+             if not self._check_table_exists(database, table):
+                 logger.warning(f"Table {database}.{table} does not exist, skipping")
+                 return (0, 0)
+
+             return self._deduplicate_table(database, table, columns, dry_run)
+         except Exception as e:
+             logger.error(
+                 f"Unexpected error while processing table {database}.{table}: {str(e)}",
+                 {'error_type': type(e).__name__}
+             )
+             return (0, 0)
+
+     def deduplicate_database(
+             self,
+             database: str,
+             tables: Optional[List[str]] = None,
+             columns_map: Optional[Dict[str, List[str]]] = None,
+             dry_run: bool = False,
+             parallel: bool = False
+     ) -> Dict[str, Tuple[int, int]]:
+         """
+         Deduplicate every table in the given database.
+
+         :param database: database name
+         :param tables: tables to process (all tables when None)
+         :param columns_map: deduplication columns per table, {table: [columns]}
+         :param dry_run: simulate only
+         :param parallel: process tables in parallel
+         :return: dict of {table: (duplicate groups, rows deleted)}
+         """
+         results = {}
+
+         try:
+             # Make sure the database exists
+             if not self._check_database_exists(database):
+                 logger.warning(f"Database {database} does not exist, skipping")
+                 return results
+
+             # Collect the tables to process
+             target_tables = tables or self._get_tables(database)
+             if not target_tables:
+                 logger.info(f"Database {database} has no tables, skipping")
+                 return results
+
+             logger.info(
+                 f"Processing {len(target_tables)} tables in database {database}",
+                 {'tables': target_tables}
+             )
+
+             if parallel and self.max_workers > 1:
+                 # Parallel processing
+                 with concurrent.futures.ThreadPoolExecutor(
+                         max_workers=self.max_workers
+                 ) as executor:
+                     futures = {}
+                     for table in target_tables:
+                         columns = columns_map.get(table) if columns_map else None
+                         futures[executor.submit(
+                             self.deduplicate_table,
+                             database, table, columns, dry_run
+                         )] = table
+
+                     for future in concurrent.futures.as_completed(futures):
+                         table = futures[future]
+                         try:
+                             dup_count, affected_rows = future.result()
+                             results[table] = (dup_count, affected_rows)
+                         except Exception as e:
+                             logger.error(
+                                 f"Error while processing table {database}.{table}: {str(e)}",
+                                 {'error_type': type(e).__name__}
+                             )
+                             results[table] = (0, 0)
+             else:
+                 # Serial processing
+                 for table in target_tables:
+                     columns = columns_map.get(table) if columns_map else None
+                     dup_count, affected_rows = self.deduplicate_table(
+                         database, table, columns, dry_run
+                     )
+                     results[table] = (dup_count, affected_rows)
+
+             # Aggregate the results
+             total_dup = sum(r[0] for r in results.values())
+             total_del = sum(r[1] for r in results.values())
+
+             logger.info(
+                 f"Finished database {database} - found {total_dup} duplicate groups, deleted {total_del} rows",
+                 {'results': results}
+             )
+
+             return results
+
+         except Exception as e:
+             logger.error(f"Unexpected error while processing database {database}: {str(e)}", {'error_type': type(e).__name__})
+             return results
+
+     def deduplicate_all(
+             self,
+             databases: Optional[List[str]] = None,
+             tables_map: Optional[Dict[str, List[str]]] = None,
+             columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
+             dry_run: bool = False,
+             parallel: bool = False
+     ) -> Dict[str, Dict[str, Tuple[int, int]]]:
+         """
+         Deduplicate every database.
+
+         :param databases: databases to process (all non-system databases when None)
+         :param tables_map: tables to process per database, {database: [tables]}
+         :param columns_map: deduplication columns per table, {database: {table: [columns]}}
+         :param dry_run: simulate only
+         :param parallel: process in parallel
+         :return: nested dict of {database: {table: (duplicate groups, rows deleted)}}
+         """
+         all_results = defaultdict(dict)
+
+         try:
+             # Collect the databases to process
+             target_dbs = databases or self._get_databases()
+             if not target_dbs:
+                 logger.warning("No databases to process")
+                 return all_results
+
+             logger.info(f"Processing {len(target_dbs)} databases", {'databases': target_dbs})
+
+             if parallel and self.max_workers > 1:
+                 # Process the databases in parallel
+                 with concurrent.futures.ThreadPoolExecutor(
+                         max_workers=self.max_workers
+                 ) as executor:
+                     futures = {}
+                     for db in target_dbs:
+                         tables = tables_map.get(db) if tables_map else None
+                         db_columns_map = columns_map.get(db) if columns_map else None
+                         futures[executor.submit(
+                             self.deduplicate_database,
+                             db, tables, db_columns_map, dry_run, False
+                         )] = db
+
+                     for future in concurrent.futures.as_completed(futures):
+                         db = futures[future]
+                         try:
+                             db_results = future.result()
+                             all_results[db] = db_results
+                         except Exception as e:
+                             logger.error(f"Error while processing database {db}: {str(e)}", {'error_type': type(e).__name__})
+                             all_results[db] = {}
+             else:
+                 # Process the databases serially
+                 for db in target_dbs:
+                     tables = tables_map.get(db) if tables_map else None
+                     db_columns_map = columns_map.get(db) if columns_map else None
+                     db_results = self.deduplicate_database(
+                         db, tables, db_columns_map, dry_run, parallel
+                     )
+                     all_results[db] = db_results
+
+             # Aggregate the overall results
+             total_dup = sum(
+                 r[0] for db in all_results.values()
+                 for r in db.values()
+             )
+             total_del = sum(
+                 r[1] for db in all_results.values()
+                 for r in db.values()
+             )
+
+             logger.info(
+                 f"All databases processed - found {total_dup} duplicate groups, deleted {total_del} rows",
+                 {'total_results': all_results}
+             )
+
+             return all_results
+
+         except Exception as e:
+             logger.error(f"Unexpected error during global processing: {str(e)}", {'error_type': type(e).__name__})
+             return all_results
+
+     @_retry_on_failure
+     def _check_database_exists(self, database: str) -> bool:
+         """Check whether the database exists."""
+         sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql, (database,))
+                 return bool(cursor.fetchone())
+
+     @_retry_on_failure
+     def _check_table_exists(self, database: str, table: str) -> bool:
+         """Check whether the table exists."""
+         sql = """
+         SELECT TABLE_NAME
+         FROM INFORMATION_SCHEMA.TABLES
+         WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+         """
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql, (database, table))
+                 return bool(cursor.fetchone())
+
+     def close(self):
+         """Close the connection pool."""
+         try:
+             if hasattr(self, 'pool') and self.pool:
+                 self.pool.close()
+                 logger.info("Database connection pool closed")
+         except Exception as e:
+             logger.error(f"Error while closing the connection pool: {str(e)}", {'error_type': type(e).__name__})
+         finally:
+             self.pool = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+
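+ # A minimal sketch of context-manager usage (assuming the same connection
+ # parameters as main() below); __exit__ closes the pool automatically:
+ #
+ #   with MySQLDeduplicator(username='root', password='pw') as dedup:
+ #       results = dedup.deduplicate_database('my_db', dry_run=True)
+ #       for table, (dup_groups, deleted) in results.items():
+ #           print(table, dup_groups, deleted)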
+ def main():
+     deduplicator = MySQLDeduplicator(
+         username='root',
+         password='pw',
+         host='localhost',
+         port=3306
+     )
+
+     # Deduplicate every database (single-threaded)
+     deduplicator.deduplicate_all()
+
+     # # Deduplicate one database (multi-threaded)
+     # deduplicator.deduplicate_database('my_db', parallel=True)
+
+     # # Deduplicate one table using specific columns
+     # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+
+     # Close the connection pool
+     deduplicator.close()
+
+ if __name__ == '__main__':
+     main()