mdbq 3.10.8__py3-none-any.whl → 3.10.10__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.10.8'
1
+ VERSION = '3.10.10'
@@ -457,6 +457,7 @@ def main():
457
457
  '安全组',
458
458
  # '视频数据',
459
459
  # '聚合数据',
460
+ '数据引擎2'
460
461
  ]
461
462
  tables_list = op.get_table_in_databases(db_list=db_list, reset_id=False)
462
463
  op.deduplicate(
@@ -3995,6 +3995,7 @@ def main(days=150, months=3):
3995
3995
  "推广数据2",
3996
3996
  "推广数据_淘宝店",
3997
3997
  "推广数据_奥莱店",
3998
+ "推广数据_圣积天猫店",
3998
3999
  "爱库存2",
3999
4000
  "生意参谋3",
4000
4001
  "生意经3",
@@ -4003,6 +4004,7 @@ def main(days=150, months=3):
4003
4004
  '商品人群画像2',
4004
4005
  '市场数据3',
4005
4006
  '回传数据',
4007
+ '数据引擎2',
4006
4008
  ]
4007
4009
  # 使用 ThreadPoolExecutor 来并行运行
4008
4010
  # with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -1,5 +1,4 @@
1
1
  # -*- coding:utf-8 -*-
2
- import datetime
3
2
  import re
4
3
  import time
5
4
  from functools import wraps
@@ -7,11 +6,12 @@ import warnings
7
6
  import pymysql
8
7
  import os
9
8
  from mdbq.log import mylogger
10
- from typing import List, Dict, Optional, Any, Tuple, Set
9
+ from typing import List, Dict, Optional, Any, Tuple
11
10
  from dbutils.pooled_db import PooledDB
12
11
  import threading
13
12
  import concurrent.futures
14
13
  from collections import defaultdict
14
+ import sys
15
15
 
16
16
 
17
17
  warnings.filterwarnings('ignore')
@@ -24,7 +24,7 @@ logger = mylogger.MyLogger(
24
24
  max_log_size=50,
25
25
  backup_count=5,
26
26
  enable_async=False, # 是否启用异步日志
27
- sample_rate=1, # 采样50%的DEBUG/INFO日志
27
+ sample_rate=1, # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
28
28
  sensitive_fields=[], # 敏感字段列表
29
29
  )
30
30
 
@@ -72,26 +72,28 @@ class MySQLDeduplicator:
72
72
  skip_system_dbs: bool = True,
73
73
  max_retries: int = 3,
74
74
  retry_interval: int = 5,
75
- pool_size: int = 5
76
- ):
75
+ pool_size: int = 5,
76
+ primary_key: str = 'id',
77
+ date_range: Optional[List[str]] = None,
78
+ recent_month: Optional[int] = None,
79
+ date_column: str = '日期',
80
+ exclude_columns: Optional[List[str]] = None
81
+ ) -> None:
77
82
  """
78
83
  初始化去重处理器
79
-
80
- :param username: 数据库用户名
81
- :param password: 数据库密码
82
- :param host: 数据库主机,默认为localhost
83
- :param port: 数据库端口,默认为3306
84
- :param charset: 字符集,默认为utf8mb4
85
- :param max_workers: 最大工作线程数,默认为1(单线程)
86
- :param batch_size: 批量处理大小,默认为1000
87
- :param skip_system_dbs: 是否跳过系统数据库,默认为True
88
- :param max_retries: 最大重试次数
89
- :param retry_interval: 重试间隔(秒)
90
- :param pool_size: 连接池大小
84
+ 新增参数:
85
+ :param date_range: 指定去重的日期区间 [start_date, end_date],格式'YYYY-MM-DD'
86
+ :param recent_month: 最近N个月的数据去重(与date_range互斥,优先生效)
87
+ :param date_column: 时间列名,默认为'日期'
88
+ :param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
91
89
  """
92
90
  # 连接池状态标志
93
91
  self._closed = False
94
-
92
+ logger.debug('初始化MySQLDeduplicator', {
93
+ 'host': host, 'port': port, 'user': username, 'charset': charset,
94
+ 'max_workers': max_workers, 'batch_size': batch_size, 'pool_size': pool_size,
95
+ 'exclude_columns': exclude_columns
96
+ })
95
97
  # 初始化连接池
96
98
  self.pool = PooledDB(
97
99
  creator=pymysql,
@@ -110,6 +112,34 @@ class MySQLDeduplicator:
110
112
  self.skip_system_dbs = skip_system_dbs
111
113
  self.max_retries = max_retries
112
114
  self.retry_interval = retry_interval
115
+ self.primary_key = primary_key
116
+
117
+ # 时间范围参数
118
+ self.date_range = date_range
119
+ self.recent_month = recent_month
120
+ self.date_column = date_column
121
+ self._dedup_start_date = None
122
+ self._dedup_end_date = None
123
+ # 不管 exclude_columns 是否传入, 'id' 一定会被排除
124
+ default_exclude = {'id'}
125
+ # exclude_columns 不传则排除: ['id', '更新时间']
126
+ if not exclude_columns:
127
+ self.exclude_columns = list(default_exclude | {'更新时间'})
128
+ else:
129
+ self.exclude_columns = list(set(exclude_columns) | default_exclude)
130
+ # 解析时间范围
131
+ if self.date_range and len(self.date_range) == 2:
132
+ self._dedup_start_date, self._dedup_end_date = self.date_range
133
+ elif self.recent_month:
134
+ from datetime import datetime, timedelta
135
+ today = datetime.today()
136
+ month = today.month - self.recent_month
137
+ year = today.year
138
+ while month <= 0:
139
+ month += 12
140
+ year -= 1
141
+ self._dedup_start_date = f"{year}-{month:02d}-01"
142
+ self._dedup_end_date = today.strftime("%Y-%m-%d")
113
143
 
114
144
  # 线程安全控制
115
145
  self._lock = threading.Lock()
@@ -118,27 +148,28 @@ class MySQLDeduplicator:
118
148
  # 系统数据库列表
119
149
  self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
120
150
 
121
- def _get_connection(self):
151
+ def _get_connection(self) -> pymysql.connections.Connection:
122
152
  """从连接池获取连接"""
123
153
  if self._closed:
154
+ logger.error('尝试获取连接但连接池已关闭')
124
155
  raise ConnectionError("连接池已关闭")
125
156
  try:
126
157
  conn = self.pool.connection()
127
158
  logger.debug("成功获取数据库连接")
128
159
  return conn
129
160
  except Exception as e:
130
- logger.error(f"获取数据库连接失败: {str(e)}")
161
+ logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
131
162
  raise ConnectionError(f"连接数据库失败: {str(e)}")
132
163
 
133
164
  @staticmethod
134
- def _retry_on_failure(func):
165
+ def _retry_on_failure(func: Any) -> Any:
135
166
  """重试装饰器"""
136
-
137
167
  @wraps(func)
138
168
  def wrapper(self, *args, **kwargs):
139
169
  last_exception = None
140
170
  for attempt in range(self.max_retries + 1):
141
171
  try:
172
+ logger.debug(f'调用{func.__name__},第{attempt+1}次尝试', {'args': args, 'kwargs': kwargs})
142
173
  return func(self, *args, **kwargs)
143
174
  except (pymysql.OperationalError, pymysql.InterfaceError) as e:
144
175
  last_exception = e
@@ -146,18 +177,17 @@ class MySQLDeduplicator:
146
177
  wait_time = self.retry_interval * (attempt + 1)
147
178
  logger.warning(
148
179
  f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
149
- {'error': str(e), 'wait_time': wait_time})
180
+ {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
150
181
  time.sleep(wait_time)
151
182
  continue
152
183
  except Exception as e:
153
184
  last_exception = e
154
- logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
185
+ logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__, 'func': func.__name__})
155
186
  break
156
-
157
187
  if last_exception:
188
+ logger.error('重试后依然失败', {'func': func.__name__, 'last_exception': str(last_exception)})
158
189
  raise last_exception
159
190
  raise Exception("未知错误")
160
-
161
191
  return wrapper
162
192
 
163
193
  @_retry_on_failure
@@ -187,7 +217,7 @@ class MySQLDeduplicator:
187
217
 
188
218
  @_retry_on_failure
189
219
  def _get_table_columns(self, database: str, table: str) -> List[str]:
190
- """获取表的列名(排除id列)"""
220
+ """获取表的列名(排除主键列)"""
191
221
  sql = """
192
222
  SELECT COLUMN_NAME
193
223
  FROM INFORMATION_SCHEMA.COLUMNS
@@ -199,7 +229,7 @@ class MySQLDeduplicator:
199
229
  with conn.cursor() as cursor:
200
230
  cursor.execute(sql, (database, table))
201
231
  return [row['COLUMN_NAME'] for row in cursor.fetchall()
202
- if row['COLUMN_NAME'].lower() != 'id']
232
+ if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
203
233
 
204
234
  def _acquire_table_lock(self, database: str, table: str) -> bool:
205
235
  """获取表处理锁,防止并发处理同一张表"""
@@ -212,7 +242,7 @@ class MySQLDeduplicator:
212
242
  self._processing_tables.add(key)
213
243
  return True
214
244
 
215
- def _release_table_lock(self, database: str, table: str):
245
+ def _release_table_lock(self, database: str, table: str) -> None:
216
246
  """释放表处理锁"""
217
247
  key = f"{database}.{table}"
218
248
 
@@ -238,100 +268,111 @@ class MySQLDeduplicator:
238
268
  """
239
269
  if not self._acquire_table_lock(database, table):
240
270
  return (0, 0)
241
-
271
+ temp_table = None
242
272
  try:
243
- logger.info(f"开始处理表: {database}.{table}")
244
-
273
+ # 获取原始数据总量
274
+ with self._get_connection() as conn:
275
+ with conn.cursor() as cursor:
276
+ logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
277
+ cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
278
+ total_count_row = cursor.fetchone()
279
+ total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
280
+ logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
245
281
  # 获取实际列名
246
282
  all_columns = self._get_table_columns(database, table)
247
- if not all_columns:
248
- logger.warning(f"表 {database}.{table} 没有有效列(可能只有id列),跳过")
249
- return (0, 0)
250
-
251
- # 使用指定列或所有列
283
+ logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
284
+ # 检查是否需要按时间范围过滤
285
+ use_time_filter = False
286
+ time_col = self.date_column
287
+ all_columns_lower = [col.lower() for col in all_columns]
288
+ # 排除exclude_columns
289
+ exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
290
+ # 统一列名小写做判断
252
291
  use_columns = columns or all_columns
253
- invalid_columns = set(use_columns) - set(all_columns)
254
-
292
+ use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
293
+ invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
255
294
  if invalid_columns:
256
- logger.warning(
257
- f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
258
- {'invalid_columns': invalid_columns}
259
- )
260
- use_columns = [col for col in use_columns if col in all_columns]
261
-
295
+ logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
262
296
  if not use_columns:
263
- logger.error(f"表 {database}.{table} 没有有效的去重列")
297
+ logger.error('没有有效的去重列', {"库": database, "": table})
264
298
  return (0, 0)
265
-
266
- # 构建去重SQL
299
+ # 统一用反引号包裹
267
300
  column_list = ', '.join([f'`{col}`' for col in use_columns])
268
- # temp_table = f"temp_{table}_{int(time.time())}"
269
- temp_table = f"temp_{table}_dedup_{os.getpid()}" # 使用进程ID构建临时表
270
- temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table) # 确保表名合法
271
-
272
- # 使用临时表方案处理去重,避免锁表问题
301
+ temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
302
+ temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
303
+ pk = self.primary_key
304
+ # 主键判断也用小写
305
+ if pk.lower() not in all_columns_lower and pk != 'id':
306
+ logger.error('', {"不存在主键列": database, "表": table, "主键列不存在": pk})
307
+ return (0, 0)
308
+ # 找到实际主键名
309
+ pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
310
+ # 构造where条件
311
+ where_time = ''
312
+ if use_time_filter:
313
+ where_time = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'"
273
314
  create_temp_sql = f"""
274
315
  CREATE TABLE `{database}`.`{temp_table}` AS
275
- SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
316
+ SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
276
317
  FROM `{database}`.`{table}`
318
+ {where_time}
277
319
  GROUP BY {column_list}
278
320
  HAVING COUNT(*) > 1
279
321
  """
280
-
281
- delete_dup_sql = f"""
282
- DELETE FROM `{database}`.`{table}`
283
- WHERE `id` NOT IN (
284
- SELECT `min_id` FROM `{database}`.`{temp_table}`
285
- ) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
286
- """
287
-
288
322
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
289
-
290
323
  with self._get_connection() as conn:
291
324
  with conn.cursor() as cursor:
292
- # 创建临时表统计重复数据
325
+ logger.debug('创建临时表SQL', {'sql': create_temp_sql})
293
326
  cursor.execute(create_temp_sql)
327
+ logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
294
328
  cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
295
- dup_count = cursor.fetchone()['cnt']
296
-
329
+ dup_count_row = cursor.fetchone()
330
+ dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
297
331
  if dup_count == 0:
298
- logger.info(f"表 {database}.{table} 没有重复数据")
332
+ logger.info('没有重复数据', {"库": database, "": table, "数据量": total_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
333
+ logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
299
334
  cursor.execute(drop_temp_sql)
300
335
  conn.commit()
301
336
  return (0, 0)
302
-
303
- logger.info(
304
- f"表 {database}.{table} 发现 {dup_count} 组重复数据",
305
- {'columns': use_columns}
306
- )
307
-
337
+ affected_rows = 0
308
338
  if not dry_run:
309
- # 执行实际删除
310
- cursor.execute(delete_dup_sql)
311
- affected_rows = cursor.rowcount
312
- conn.commit()
313
- logger.info(
314
- f"表 {database}.{table} 已删除 {affected_rows} 行重复数据",
315
- {'columns': use_columns}
316
- )
339
+ # 分批删除,避免锁表
340
+ while True:
341
+ delete_dup_sql = f"""
342
+ DELETE FROM `{database}`.`{table}`
343
+ WHERE `{pk_real}` NOT IN (
344
+ SELECT `min_id` FROM `{database}`.`{temp_table}`
345
+ ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
346
+ AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
347
+ LIMIT {self.batch_size}
348
+ """
349
+ logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
350
+ cursor.execute(delete_dup_sql)
351
+ batch_deleted = cursor.rowcount
352
+ affected_rows += batch_deleted
353
+ conn.commit()
354
+ if batch_deleted < self.batch_size:
355
+ break
356
+ logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
317
357
  else:
358
+ logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
318
359
  affected_rows = 0
319
- logger.info(
320
- f"[模拟运行] 表 {database}.{table} 将删除 {dup_count} 组重复数据",
321
- {'columns': use_columns}
322
- )
323
-
324
- # 清理临时表
360
+ logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
325
361
  cursor.execute(drop_temp_sql)
326
362
  conn.commit()
327
-
328
363
  return (dup_count, affected_rows)
329
-
330
364
  except Exception as e:
331
- logger.error(
332
- f"处理表 {database}.{table} 时出错: {str(e)}",
333
- {'error_type': type(e).__name__}
334
- )
365
+ logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
366
+ # 异常时也要清理临时表
367
+ if temp_table:
368
+ try:
369
+ with self._get_connection() as conn:
370
+ with conn.cursor() as cursor:
371
+ drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
372
+ cursor.execute(drop_temp_sql)
373
+ conn.commit()
374
+ except Exception as drop_e:
375
+ logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
335
376
  return (0, 0)
336
377
  finally:
337
378
  self._release_table_lock(database, table)
@@ -353,17 +394,15 @@ class MySQLDeduplicator:
353
394
  :return: (重复行数, 删除行数)
354
395
  """
355
396
  try:
356
- # 检查表是否存在
357
397
  if not self._check_table_exists(database, table):
358
- logger.warning(f"表 {database}.{table} 不存在,跳过")
398
+ logger.warning('表不存在', {"库": database, "": table, "warning": "跳过"})
359
399
  return (0, 0)
360
-
361
- return self._deduplicate_table(database, table, columns, dry_run)
400
+ logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
401
+ result = self._deduplicate_table(database, table, columns, dry_run)
402
+ logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
403
+ return result
362
404
  except Exception as e:
363
- logger.error(
364
- f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
365
- {'error_type': type(e).__name__}
366
- )
405
+ logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
367
406
  return (0, 0)
368
407
 
369
408
  def deduplicate_database(
@@ -385,49 +424,40 @@ class MySQLDeduplicator:
385
424
  :return: 字典 {表名: (重复行数, 删除行数)}
386
425
  """
387
426
  results = {}
388
-
389
427
  try:
390
- # 检查数据库是否存在
391
428
  if not self._check_database_exists(database):
392
- logger.warning(f"数据库 {database} 不存在,跳过")
429
+ logger.warning('数据库不存在', {"库": database})
393
430
  return results
394
-
395
- # 获取要处理的表
396
431
  target_tables = tables or self._get_tables(database)
432
+ logger.debug('获取目标表', {'库': database, 'tables': target_tables})
397
433
  if not target_tables:
398
- logger.info(f"数据库 {database} 中没有表,跳过")
434
+ logger.info('数据库中没有表', {"库": database, "操作": "跳过"})
399
435
  return results
400
-
401
- logger.info(
402
- f"开始处理数据库 {database} 中的 {len(target_tables)} 张表",
403
- {'tables': target_tables}
404
- )
405
-
436
+ logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
406
437
  if parallel and self.max_workers > 1:
407
- # 并行处理
438
+ logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
439
+ # 使用线程池并行处理
408
440
  with concurrent.futures.ThreadPoolExecutor(
409
441
  max_workers=self.max_workers
410
442
  ) as executor:
411
443
  futures = {}
412
444
  for table in target_tables:
413
445
  columns = columns_map.get(table) if columns_map else None
446
+ logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
414
447
  futures[executor.submit(
415
448
  self.deduplicate_table,
416
449
  database, table, columns, dry_run
417
450
  )] = table
418
-
419
451
  for future in concurrent.futures.as_completed(futures):
420
452
  table = futures[future]
421
453
  try:
422
454
  dup_count, affected_rows = future.result()
423
455
  results[table] = (dup_count, affected_rows)
424
456
  except Exception as e:
425
- logger.error(
426
- f"处理表 {database}.{table} 时出错: {str(e)}",
427
- {'error_type': type(e).__name__}
428
- )
457
+ logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
429
458
  results[table] = (0, 0)
430
459
  else:
460
+ logger.debug('串行处理表', {'库': database})
431
461
  # 串行处理
432
462
  for table in target_tables:
433
463
  columns = columns_map.get(table) if columns_map else None
@@ -435,20 +465,12 @@ class MySQLDeduplicator:
435
465
  database, table, columns, dry_run
436
466
  )
437
467
  results[table] = (dup_count, affected_rows)
438
-
439
- # 统计结果
440
468
  total_dup = sum(r[0] for r in results.values())
441
469
  total_del = sum(r[1] for r in results.values())
442
-
443
- logger.info(
444
- f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
445
- {'results': results}
446
- )
447
-
470
+ logger.info('单库完成', {"库": database, "重复组数": total_dup, "总删除行数": total_del, "详细结果": results})
448
471
  return results
449
-
450
472
  except Exception as e:
451
- logger.error(f"处理数据库 {database} 时发生全局错误: {str(e)}", {'error_type': type(e).__name__})
473
+ logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
452
474
  return results
453
475
 
454
476
  def deduplicate_all(
@@ -470,18 +492,15 @@ class MySQLDeduplicator:
470
492
  :return: 嵌套字典 {数据库名: {表名: (重复行数, 删除行数)}}
471
493
  """
472
494
  all_results = defaultdict(dict)
473
-
474
495
  try:
475
- # 获取要处理的数据库
476
496
  target_dbs = databases or self._get_databases()
497
+ logger.debug('获取目标数据库', {'databases': target_dbs})
477
498
  if not target_dbs:
478
- logger.warning("没有可处理的数据库")
499
+ logger.warning('没有可处理的数据库')
479
500
  return all_results
480
-
481
- logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
482
-
501
+ logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
483
502
  if parallel and self.max_workers > 1:
484
- # 并行处理数据库
503
+ # 使用线程池并行处理多个数据库
485
504
  with concurrent.futures.ThreadPoolExecutor(
486
505
  max_workers=self.max_workers
487
506
  ) as executor:
@@ -493,14 +512,13 @@ class MySQLDeduplicator:
493
512
  self.deduplicate_database,
494
513
  db, tables, db_columns_map, dry_run, False
495
514
  )] = db
496
-
497
515
  for future in concurrent.futures.as_completed(futures):
498
516
  db = futures[future]
499
517
  try:
500
518
  db_results = future.result()
501
519
  all_results[db] = db_results
502
520
  except Exception as e:
503
- logger.error(f"处理数据库 {db} 时出错: {str(e)}", {'error_type': type(e).__name__})
521
+ logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
504
522
  all_results[db] = {}
505
523
  else:
506
524
  # 串行处理数据库
@@ -511,8 +529,6 @@ class MySQLDeduplicator:
511
529
  db, tables, db_columns_map, dry_run, parallel
512
530
  )
513
531
  all_results[db] = db_results
514
-
515
- # 统计总体结果
516
532
  total_dup = sum(
517
533
  r[0] for db in all_results.values()
518
534
  for r in db.values()
@@ -521,16 +537,10 @@ class MySQLDeduplicator:
521
537
  r[1] for db in all_results.values()
522
538
  for r in db.values()
523
539
  )
524
-
525
- logger.info(
526
- f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
527
- {'total_results': all_results}
528
- )
529
-
540
+ logger.info('全局完成', {"总重复组数": total_dup, "总删除行数": total_del, "详细结果": dict(all_results)})
530
541
  return all_results
531
-
532
542
  except Exception as e:
533
- logger.error(f"全局处理时发生错误: {str(e)}", {'error_type': type(e).__name__})
543
+ logger.error('异常', {"error": str(e), 'traceback': repr(e)})
534
544
  return all_results
535
545
 
536
546
  @_retry_on_failure
@@ -557,42 +567,46 @@ class MySQLDeduplicator:
557
567
  cursor.execute(sql, (database, table))
558
568
  return bool(cursor.fetchone())
559
569
 
560
- def close(self):
570
+ def close(self) -> None:
561
571
  """关闭连接池"""
562
572
  try:
563
573
  if hasattr(self, 'pool') and self.pool and not self._closed:
564
574
  self.pool.close()
565
575
  self._closed = True
566
576
  logger.info("数据库连接池已关闭")
577
+ else:
578
+ logger.info('连接池已关闭或不存在')
567
579
  except Exception as e:
568
- logger.error(f"关闭连接池时出错: {str(e)}", {'error_type': type(e).__name__})
580
+ logger.error(f"关闭连接池时出错", {'error_type': type(e).__name__, 'error': str(e)})
569
581
 
570
- def __enter__(self):
582
+ def __enter__(self) -> 'MySQLDeduplicator':
571
583
  return self
572
584
 
573
- def __exit__(self, exc_type, exc_val, exc_tb):
585
+ def __exit__(self, exc_type: Optional[type], exc_val: Optional[BaseException], exc_tb: Optional[Any]) -> None:
574
586
  self.close()
575
587
 
576
588
 
577
589
  def main():
578
590
  deduplicator = MySQLDeduplicator(
579
591
  username='root',
580
- password='188988yang188',
592
+ password='pwd',
581
593
  host='localhost',
582
594
  port=3306
583
595
  )
584
596
 
585
597
  # 全库去重(单线程)
586
- deduplicator.deduplicate_all()
598
+ deduplicator.deduplicate_all(dry_run=False, parallel=False)
587
599
 
588
600
  # # 指定数据库去重(多线程)
589
- # deduplicator.deduplicate_database('my_db', parallel=True)
601
+ # logger.info('调用deduplicate_database')
602
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
590
603
 
591
604
  # # 指定表去重(使用特定列)
592
- # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
605
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
593
606
 
594
607
  # 关闭连接
595
608
  deduplicator.close()
596
609
 
597
610
  if __name__ == '__main__':
598
- main()
611
+ # main()
612
+ pass