mdbq 3.11.10-py3-none-any.whl → 3.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
- VERSION = '3.11.10'
+ VERSION = '3.12.0'
mdbq/mysql/deduplicator.py CHANGED
@@ -14,6 +14,7 @@ from collections import defaultdict
  import sys
  from datetime import datetime
  import uuid
+ from contextlib import contextmanager


  warnings.filterwarnings('ignore')
@@ -34,32 +35,6 @@ logger = mylogger.MyLogger(
  class MySQLDeduplicator:
  """
  MySQL数据去重
-
- 功能:
- 1. 自动检测并删除MySQL数据库中的重复数据
- 2. 支持全库扫描或指定表理
- 3. 支持多线程/多进程安全处理
- 4. 完善的错误处理和日志记录
-
- 使用示例:
- deduplicator = MySQLDeduplicator(
- username='root',
- password='password',
- host='localhost',
- port=3306
- )
-
- # 全库去重
- deduplicator.deduplicate_all()
-
- # 指定数据库去重(多线程)
- deduplicator.deduplicate_database('my_db', parallel=True)
-
- # 指定表去重(使用特定列)
- deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
-
- # 关闭连接
- deduplicator.close()
  """

  def __init__(
@@ -69,12 +44,12 @@ class MySQLDeduplicator:
  host: str = 'localhost',
  port: int = 3306,
  charset: str = 'utf8mb4',
- max_workers: int = 1,
+ max_workers: int = 2,
  batch_size: int = 1000,
  skip_system_dbs: bool = True,
  max_retries: int = 3,
- retry_interval: int = 5,
- pool_size: int = 5,
+ retry_waiting_time: int = 5,
+ pool_size: int = 10,
  primary_key: str = 'id',
  date_range: Optional[List[str]] = None,
  recent_month: Optional[int] = None,
@@ -113,35 +88,40 @@ class MySQLDeduplicator:
  cursorclass=pymysql.cursors.DictCursor
  )

+ # 并发模式要将 pool_size 加大
+ MAX_POOL_SIZE = 200
+ MAX_WORKERS = 4
+ if max_workers > MAX_WORKERS:
+ logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
+ max_workers = MAX_WORKERS
+ expected_threads = max_workers * 10
+ if pool_size < expected_threads:
+ logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
+ pool_size = expected_threads
+ if pool_size > MAX_POOL_SIZE:
+ logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
+ pool_size = MAX_POOL_SIZE
+ self.max_workers = max_workers
+ self.pool_size = pool_size
+
  # 配置参数
- self.max_workers = min(max(1, max_workers), pool_size) # 限制最大线程数,不能超过连接池
  self.batch_size = batch_size
  self.skip_system_dbs = skip_system_dbs
  self.max_retries = max_retries
- self.retry_interval = retry_interval
+ self.retry_waiting_time = retry_waiting_time
  self.primary_key = primary_key

  # 时间范围参数
- self.date_range = date_range
- self.recent_month = recent_month
  self.date_column = date_column
  self._dedup_start_date = None
  self._dedup_end_date = None
- # 不管 exclude_columns 是否传入, 'id' 一定会被排除
- default_exclude = {'id'}
- # exclude_columns 不传则排除: ['id', '更新时间']
- if not exclude_columns:
- self.exclude_columns = list(default_exclude | {'更新时间'})
- else:
- self.exclude_columns = list(set(exclude_columns) | default_exclude)
- # 解析时间范围并智能校正date_range
- if self.date_range and len(self.date_range) == 2:
+ if date_range and len(date_range) == 2:
  try:
- start, end = self.date_range
+ start, end = date_range
  start_dt = datetime.strptime(start, "%Y-%m-%d")
  end_dt = datetime.strptime(end, "%Y-%m-%d")
  if start_dt > end_dt:
- logger.warning(
+ logger.debug(
  "date_range顺序不正确,自动交换开始和结束日期。",
  {"start": start, "end": end}
  )
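The hunk above drops the old `min(max(1, max_workers), pool_size)` cap in favour of explicit clamping: `max_workers` is limited to 4, and `pool_size` is pushed into the range [`max_workers * 10`, 200]. A minimal standalone sketch of the same rules (the function name `clamp_pool_settings` is illustrative, not part of mdbq's API):

    # Sketch of the 3.12.0 clamping rules; MAX_WORKERS=4 and MAX_POOL_SIZE=200
    # are taken from the hunk above, the wrapper function itself is hypothetical.
    def clamp_pool_settings(max_workers: int, pool_size: int):
        MAX_POOL_SIZE = 200
        MAX_WORKERS = 4
        if max_workers > MAX_WORKERS:
            max_workers = MAX_WORKERS        # cap worker threads at 4
        expected_threads = max_workers * 10
        if pool_size < expected_threads:
            pool_size = expected_threads     # keep ~10 connections per worker
        if pool_size > MAX_POOL_SIZE:
            pool_size = MAX_POOL_SIZE        # hard ceiling on the pool
        return max_workers, pool_size

    assert clamp_pool_settings(8, 5) == (4, 40)
    assert clamp_pool_settings(2, 500) == (2, 200)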
@@ -151,30 +131,36 @@ class MySQLDeduplicator:
  except Exception as e:
  logger.error(
  "date_range参数格式错误,应为['YYYY-MM-DD', 'YYYY-MM-DD'],已忽略时间范围。",
- {"date_range": self.date_range, "error": str(e)}
+ {"date_range": date_range, "error": str(e)}
  )
  self._dedup_start_date = None
  self._dedup_end_date = None
- elif self.recent_month:
+ elif recent_month:
  today = datetime.today()
- month = today.month - self.recent_month
+ month = today.month - recent_month
  year = today.year
  while month <= 0:
  month += 12
  year -= 1
  self._dedup_start_date = f"{year}-{month:02d}-01"
  self._dedup_end_date = today.strftime("%Y-%m-%d")
+
+ if self._dedup_start_date and self._dedup_end_date:
+ logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
+
+ # 排除列处理,直接合并去重
+ self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))

  # 线程安全控制
  self._lock = threading.Lock()
  self._processing_tables = set() # 正在处理的表集合

  # 系统数据库列表
- self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
+ self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys', 'sakila'}

  # 排除数据库和表的逻辑
- self.exclude_databases = set([db.lower() for db in exclude_databases]) if exclude_databases else set()
- self.exclude_tables = {k.lower(): set([t.lower() for t in v]) for k, v in (exclude_tables or {}).items()}
+ self.exclude_databases = set(db.lower() for db in (exclude_databases or []))
+ self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}

  self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'

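When `recent_month` is given instead of `date_range`, `__init__` walks the month counter back and borrows from the year until it is positive, then dedupes from the first day of that month through today. A self-contained sketch of the same arithmetic (`dedup_window` is a hypothetical name):

    from datetime import datetime

    def dedup_window(recent_month: int, today: datetime):
        # Walk back recent_month months, borrowing years while month <= 0,
        # exactly as the __init__ hunk above does.
        month = today.month - recent_month
        year = today.year
        while month <= 0:
            month += 12
            year -= 1
        return f"{year}-{month:02d}-01", today.strftime("%Y-%m-%d")

    # recent_month=3 on 2025-01-15 -> ('2024-10-01', '2025-01-15')
    assert dedup_window(3, datetime(2025, 1, 15)) == ("2024-10-01", "2025-01-15")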
@@ -197,6 +183,14 @@ class MySQLDeduplicator:
  logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
  raise ConnectionError(f"连接数据库失败: {str(e)}")

+ @contextmanager
+ def _conn_ctx(self):
+ conn = self._get_connection()
+ try:
+ yield conn
+ finally:
+ conn.close()
+
  @staticmethod
  def _retry_on_failure(func: Any) -> Any:
  """
@@ -220,7 +214,7 @@ class MySQLDeduplicator:
  except (pymysql.OperationalError, pymysql.InterfaceError) as e:
  last_exception = e
  if attempt < self.max_retries:
- wait_time = self.retry_interval * (attempt + 1)
+ wait_time = self.retry_waiting_time * (attempt + 1)
  logger.warning(
  f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
  {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
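After the `retry_interval` → `retry_waiting_time` rename, the retry decorator still backs off linearly: the wait before attempt N+1 is `retry_waiting_time * (attempt + 1)`, so the defaults (`retry_waiting_time=5`, `max_retries=3`) give waits of 5s, 10s, 15s:

    retry_waiting_time, max_retries = 5, 3
    waits = [retry_waiting_time * (attempt + 1) for attempt in range(max_retries)]
    assert waits == [5, 10, 15]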
@@ -236,16 +230,15 @@ class MySQLDeduplicator:
  raise Exception("未知错误")
  return wrapper

- @_retry_on_failure
  def _get_databases(self) -> List[str]:
  """
- 获取所有非系统数据库列表,排除exclude_databases。
+ 获取所有非系统数据库列表,排除 exclude_databases。

  Returns:
  List[str]: 数据库名列表。
  """
  sql = "SHOW DATABASES"
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql)
  all_dbs = [row['Database'] for row in cursor.fetchall()]
@@ -253,10 +246,9 @@ class MySQLDeduplicator:
  filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
  return filtered

- @_retry_on_failure
  def _get_tables(self, database: str) -> List[str]:
  """
- 获取指定数据库的所有表名。
+ 获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。

  Args:
  database (str): 数据库名。
@@ -264,15 +256,12 @@ class MySQLDeduplicator:
  List[str]: 表名列表。
  """
  sql = "SHOW TABLES"
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(f"USE `{database}`")
  cursor.execute(sql)
- # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
  return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]

- @_retry_on_failure
  def _get_table_columns(self, database: str, table: str) -> List[str]:
  """
  获取指定表的所有列名(排除主键列)。
@@ -289,56 +278,22 @@ class MySQLDeduplicator:
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
  ORDER BY ORDINAL_POSITION
  """
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql, (database, table))
  return [row['COLUMN_NAME'] for row in cursor.fetchall()
  if row['COLUMN_NAME'].lower() != self.primary_key.lower()]

- def _acquire_table_lock(self, database: str, table: str) -> bool:
- """
- 获取表处理锁,防止并发处理同一张表。
-
- Args:
- database (str): 数据库名。
- table (str): 表名。
- Returns:
- bool: 是否成功获取锁。
- """
- key = f"{database}.{table}"
-
- with self._lock:
- if key in self._processing_tables:
- logger.debug(f"表 {key} 正在被其他线程处理,跳过")
- return False
- self._processing_tables.add(key)
- return True
-
- def _release_table_lock(self, database: str, table: str) -> None:
- """
- 释放表处理锁。
-
- Args:
- database (str): 数据库名。
- table (str): 表名。
- """
- key = f"{database}.{table}"
-
- with self._lock:
- if key in self._processing_tables:
- self._processing_tables.remove(key)
-
- @_retry_on_failure
  def _ensure_index(self, database: str, table: str, date_column: str) -> None:
  """
- 检查并为date_column自动创建索引(如果未存在)。
+ 检查并为 date_column 自动创建索引(如果未存在)。
+
  Args:
  database (str): 数据库名。
  table (str): 表名。
  date_column (str): 需要检查的日期列名。
  """
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  # 检查索引是否已存在
  cursor.execute(
@@ -356,11 +311,9 @@ class MySQLDeduplicator:
  try:
  cursor.execute(f"CREATE INDEX `{safe_index_name}` ON `{database}`.`{table}` (`{date_column}`)")
  conn.commit()
- logger.info('已自动为date_column创建索引', {"库": database, "表": table, "date_column": date_column, "索引名": safe_index_name})
+ logger.debug('已自动为date_column创建索引', {"库": database, "表": table, "date_column": date_column, "索引名": safe_index_name})
  except Exception as e:
  logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
- else:
- logger.debug('date_column已存在索引', {"库": database, "表": table, "date_column": date_column})

  def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
  """
@@ -377,7 +330,7 @@ class MySQLDeduplicator:
  offset = 0
  while True:
  sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql)
  rows = cursor.fetchall()
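`_row_generator` (its loop continues in the next hunk) pages through the table with LIMIT/OFFSET, stopping on an empty or short page. The loop skeleton, with `fetch` standing in for the cursor round-trip:

    def pages(fetch, batch_size=10000):
        # fetch(limit, offset) -> list of rows, i.e. one SELECT ... LIMIT/OFFSET
        offset = 0
        while True:
            rows = fetch(batch_size, offset)
            if not rows:
                break
            yield from rows
            if len(rows) < batch_size:   # short page: table exhausted
                break
            offset += batch_size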
@@ -388,85 +341,184 @@ class MySQLDeduplicator:
  if len(rows) < batch_size:
  break
  offset += batch_size
-
- def _get_all_dates(self, database: str, table: str, date_column: str) -> list:
+
+ def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
  """
  获取表中所有不同的日期分区(按天)。
+
  Args:
  database (str): 数据库名。
  table (str): 表名。
  date_column (str): 日期列名。
  Returns:
- List: 所有不同的日期(字符串)。
+ List[str]: 所有不同的日期(字符串)。
  """
  sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql)
  return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]

  def _deduplicate_table(
- self,
- database: str,
- table: str,
- columns: Optional[List[str]] = None,
- dry_run: bool = False,
- use_python_dedup: bool = False,
- dedup_start_date: Optional[str] = None,
- dedup_end_date: Optional[str] = None,
- lock_table: bool = True
+ self,
+ database: str,
+ table: str,
+ columns: Optional[List[str]] = None,
+ dry_run: bool = False,
+ use_python_dedup: bool = False,
+ date_val: Optional[str] = None,
+ lock_table: bool = True
  ) -> Tuple[int, int]:
  """
- 执行单表去重。
- 支持按天分批处理(如果表包含date_column),否则全表去重。
- 如果date_column在exclude_columns中,直接跳过该表。
- 优化:分批删除时用主键、避免重复建/删临时表、并发处理每天。
+ 执行单表单天去重。只处理 date_val 这一天的数据(如果有 date_column),否则全表。
+
+ Args:
+ database (str): 数据库名。
+ table (str): 表名。
+ columns (Optional[List[str]]): 指定去重列。
+ dry_run (bool): 是否为模拟运行。
+ use_python_dedup (bool): 是否用 Python 方式去重。
+ date_val (Optional[str]): 指定处理的日期(如有 date_column)。
+ lock_table (bool): 是否加表级锁。
+ Returns:
+ Tuple[int, int]: (重复组数, 实际删除行数)
  """
  if lock_table and not self._acquire_table_lock(database, table):
  return (0, 0)
  temp_table = None
  try:
- # 获取实际列名
  all_columns = self._get_table_columns(database, table)
  all_columns_lower = [col.lower() for col in all_columns]
  exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
  time_col = self.date_column
  time_col_lower = time_col.lower() if time_col else None
- # 1. 跳过date_column在exclude_columns的情况
  if time_col_lower and time_col_lower in exclude_columns_lower:
  logger.warning('date_column在exclude_columns中,跳过该表', {"库": database, "表": table, "date_column": time_col, "exclude_columns": self.exclude_columns})
  return (0, 0)
- # 2. 判断表是否包含date_column
  has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
- # 如果包含date_column,自动检查并创建索引
- if has_time_col and dedup_start_date is None and dedup_end_date is None:
+
+ # 只要有date_column,始终分天处理(本函数只处理一天)
+ if has_time_col and date_val is not None:
  self._ensure_index(database, table, time_col)
- # 按天分区多线程处理
- all_dates = self._get_all_dates(database, table, time_col)
- total_dup = 0
- total_del = 0
- def process_date(date_val):
- try:
- logger.debug('按天分区去重', {"库": database, "表": table, "日期": date_val})
- dup_count, affected_rows = self._deduplicate_table(
- database, table, columns, dry_run, use_python_dedup,
- dedup_start_date=date_val, dedup_end_date=date_val,
- lock_table=False
- )
- return (dup_count, affected_rows, date_val, None)
- except Exception as e:
- logger.error('分区去重异常', {"库": database, "表": table, "日期": date_val, "异常": str(e), "func": sys._getframe().f_code.co_name})
- return (0, 0, date_val, str(e))
- with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
- future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
- for future in concurrent.futures.as_completed(future_to_date):
- dup_count, affected_rows, date_val, err = future.result()
- if err:
- logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
- total_dup += dup_count
- total_del += affected_rows
- return (total_dup, total_del)
- # 获取去重列
+ # 获取去重列
+ use_columns = columns or all_columns
+ use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
+ invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
+ if invalid_columns:
+ logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
+ if not use_columns:
+ logger.error('没有有效的去重列', {"库": database, "表": table, "func": sys._getframe().f_code.co_name})
+ return (0, 0)
+ pk = self.primary_key
+ pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
+ where_sql = f"t.`{time_col}` = '{date_val}'"
+ # 获取原始数据总量(只统计当天数据)
+ with self._conn_ctx() as conn:
+ with conn.cursor() as cursor:
+ count_where = f"WHERE `{time_col}` = '{date_val}'"
+ count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+ logger.debug('执行SQL', {'sql': count_sql})
+ cursor.execute(count_sql)
+ total_count_row = cursor.fetchone()
+ total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+ logger.debug('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name, "数据日期": date_val})
+ column_list = ', '.join([f'`{col}`' for col in use_columns])
+
+ # 用Python查找重复
+ if use_python_dedup:
+ select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
+ select_where = f"WHERE `{time_col}` = '{date_val}'"
+ grouped = defaultdict(list)
+ for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
+ key = tuple(row[col] for col in use_columns)
+ grouped[key].append(row[pk_real])
+ dup_count = 0
+ del_ids = []
+ for ids in grouped.values():
+ if len(ids) > 1:
+ dup_count += 1
+ del_ids.extend(ids[1:])
+ affected_rows = 0
+ if not dry_run and del_ids:
+ with self._conn_ctx() as conn:
+ with conn.cursor() as cursor:
+ for i in range(0, len(del_ids), self.batch_size):
+ batch_ids = del_ids[i:i+self.batch_size]
+ del_ids_str = ','.join([str(i) for i in batch_ids])
+ delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+ cursor.execute(delete_sql)
+ batch_deleted = cursor.rowcount
+ affected_rows += batch_deleted
+ conn.commit()
+ logger.debug('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode, "数据日期": date_val})
+ return (dup_count, affected_rows)
+ # SQL方式查找重复
+ temp_table = self._make_temp_table_name(table)
+ drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+ create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
+ create_temp_sql = f"""
+ CREATE TABLE `{database}`.`{temp_table}` AS
+ SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+ FROM `{database}`.`{table}`
+ {create_temp_where}
+ GROUP BY {column_list}
+ HAVING COUNT(*) > 1
+ """
+ with self._conn_ctx() as conn:
+ with conn.cursor() as cursor:
+ logger.debug('创建临时表SQL', {'sql': create_temp_sql})
+ cursor.execute(create_temp_sql)
+ cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
+ dup_count_row = cursor.fetchone()
+ dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
+ if dup_count == 0:
+ logger.debug('没有重复数据', {"库": database, "表": table, "数据量": total_count, "数据日期": date_val})
+ cursor.execute(drop_temp_sql)
+ conn.commit()
+ return (0, 0)
+ affected_rows = 0
+ if not dry_run:
+ while True:
+ where_clauses = []
+ if self.duplicate_keep_mode == 'keep_one':
+ where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
+ if where_sql.strip():
+ where_clauses.append(where_sql.strip())
+ where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+ find_dup_ids_sql = f"""
+ SELECT t.`{pk_real}` as del_id
+ FROM `{database}`.`{table}` t
+ JOIN `{database}`.`{temp_table}` tmp
+ ON {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+ {where_full}
+ LIMIT {self.batch_size}
+ """
+ logger.debug('查找待删除重复id SQL', {'sql': find_dup_ids_sql})
+ cursor.execute(find_dup_ids_sql)
+ del_ids = [row['del_id'] for row in cursor.fetchall()]
+ if not del_ids:
+ break
+ del_ids_str = ','.join([str(i) for i in del_ids])
+ delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+ logger.debug('按id批量删除SQL', {'sql': delete_sql, 'ids': del_ids})
+ cursor.execute(delete_sql)
+ batch_deleted = cursor.rowcount
+ affected_rows += batch_deleted
+ conn.commit()
+ if batch_deleted == 0:
+ logger.warning('检测到未能删除任何数据,强制跳出循环,防止假死', {"库": database, "表": table})
+ break
+ if batch_deleted < self.batch_size:
+ break
+ logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode, "数据日期": date_val})
+ else:
+ logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
+ affected_rows = 0
+ cursor.execute(drop_temp_sql)
+ conn.commit()
+ return (dup_count, affected_rows)
+ # 没有date_column,处理全表
+ # ...existing code for full-table deduplication (as before, but without recursion)...
  use_columns = columns or all_columns
  use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
  invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
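The Python dedup path above groups row ids by the tuple of dedup-column values and keeps only the first id of each group; everything else lands in `del_ids`. A tiny self-contained illustration (the rows and column names are made up):

    from collections import defaultdict

    rows = [
        {"id": 1, "name": "a", "date": "2025-06-09"},
        {"id": 2, "name": "a", "date": "2025-06-09"},   # duplicate of id 1
        {"id": 3, "name": "b", "date": "2025-06-09"},
    ]
    use_columns = ["name", "date"]
    grouped = defaultdict(list)
    for row in rows:
        grouped[tuple(row[c] for c in use_columns)].append(row["id"])

    del_ids = [i for ids in grouped.values() if len(ids) > 1 for i in ids[1:]]
    assert del_ids == [2]   # id 2 would be deleted, id 1 kept

The SQL path instead materializes the duplicate groups into a `temp_` table via `GROUP BY ... HAVING COUNT(*) > 1` and deletes by primary key in `batch_size` chunks.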
@@ -477,70 +529,53 @@ class MySQLDeduplicator:
  return (0, 0)
  pk = self.primary_key
  pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
- # 判断是否需要加日期区间条件
- where_sql = ''
- if has_time_col and dedup_start_date and dedup_end_date:
- where_sql = f"t.`{time_col}` >= '{dedup_start_date}' AND t.`{time_col}` <= '{dedup_end_date}'"
- # 获取原始数据总量(只统计区间内数据)
- with self._get_connection() as conn:
+ # 获取原始数据总量
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
- count_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
- count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+ count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
  logger.debug('执行SQL', {'sql': count_sql})
  cursor.execute(count_sql)
  total_count_row = cursor.fetchone()
  total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
- logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name, "数据日期": dedup_end_date})
+ logger.debug('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
  column_list = ', '.join([f'`{col}`' for col in use_columns])
-
- # 用Python查找重复
  if use_python_dedup:
- # 1. 拉取所有数据(生成器分批拉取)
  select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
- select_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
- select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
- logger.debug('用Python查找重复,拉取数据SQL', {'sql': select_sql})
- # 用生成器分批拉取
+ select_where = ''
  grouped = defaultdict(list)
  for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
  key = tuple(row[col] for col in use_columns)
  grouped[key].append(row[pk_real])
- # 2. 统计重复组和待删除id
  dup_count = 0
  del_ids = []
  for ids in grouped.values():
  if len(ids) > 1:
  dup_count += 1
- del_ids.extend(ids[1:]) # 只保留第一个
+ del_ids.extend(ids[1:])
  affected_rows = 0
  if not dry_run and del_ids:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  for i in range(0, len(del_ids), self.batch_size):
- batch = del_ids[i:i+self.batch_size]
- del_ids_str = ','.join([str(i) for i in batch])
+ batch_ids = del_ids[i:i+self.batch_size]
+ del_ids_str = ','.join([str(i) for i in batch_ids])
  delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
- logger.debug('用Python分批删除SQL', {'sql': delete_sql, 'ids': batch})
  cursor.execute(delete_sql)
  batch_deleted = cursor.rowcount
  affected_rows += batch_deleted
  conn.commit()
- logger.info('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode, "数据日期": dedup_end_date})
+ logger.debug('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode})
  return (dup_count, affected_rows)
- # SQL方式查找重复
  temp_table = self._make_temp_table_name(table)
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
- # 创建临时表时加where条件
- create_temp_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
  create_temp_sql = f"""
  CREATE TABLE `{database}`.`{temp_table}` AS
  SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
  FROM `{database}`.`{table}`
- {create_temp_where}
  GROUP BY {column_list}
  HAVING COUNT(*) > 1
  """
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  logger.debug('创建临时表SQL', {'sql': create_temp_sql})
  cursor.execute(create_temp_sql)
@@ -548,7 +583,7 @@ class MySQLDeduplicator:
  dup_count_row = cursor.fetchone()
  dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
  if dup_count == 0:
- logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "数据日期": dedup_end_date})
+ logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count})
  cursor.execute(drop_temp_sql)
  conn.commit()
  return (0, 0)
@@ -558,8 +593,6 @@ class MySQLDeduplicator:
  where_clauses = []
  if self.duplicate_keep_mode == 'keep_one':
  where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
- if where_sql.strip():
- where_clauses.append(where_sql.strip())
  where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
  find_dup_ids_sql = f"""
  SELECT t.`{pk_real}` as del_id
@@ -586,7 +619,7 @@ class MySQLDeduplicator:
  break
  if batch_deleted < self.batch_size:
  break
- logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode, "数据日期": dedup_end_date})
+ logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode})
  else:
  logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
  affected_rows = 0
@@ -595,10 +628,9 @@ class MySQLDeduplicator:
  return (dup_count, affected_rows)
  except Exception as e:
  logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
- # 异常时也要清理临时表
  if temp_table:
  try:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
  cursor.execute(drop_temp_sql)
@@ -611,26 +643,26 @@ class MySQLDeduplicator:
  self._release_table_lock(database, table)

  def deduplicate_table(
- self,
- database: str,
- table: str,
- columns: Optional[List[str]] = None,
- dry_run: bool = False,
- reorder_id: bool = False,
- use_python_dedup: bool = True
+ self,
+ database: str,
+ table: str,
+ columns: Optional[List[str]] = None,
+ dry_run: bool = False,
+ reorder_id: bool = False,
+ use_python_dedup: bool = True
  ) -> Tuple[int, int]:
  """
- 对指定表进行去重。
+ 对指定表进行去重。始终按天分区(如有 date_column),否则全表。

  Args:
  database (str): 数据库名。
  table (str): 表名。
- columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
- dry_run (bool): 是否为模拟运行(只统计不实际删除)。
- reorder_id (bool): 去重后是否重排id
- use_python_dedup (bool): 是否用Python查找重复id。
+ columns (Optional[List[str]]): 指定去重列。
+ dry_run (bool): 是否为模拟运行。
+ reorder_id (bool): 去重后是否自动重排 id 列。
+ use_python_dedup (bool): 是否用 Python 方式去重。
  Returns:
- Tuple[int, int]: (重复组数, 实际删除行数)
+ Tuple[int, int]: (重复组数, 实际删除行数)
  """
  if database.lower() in self.exclude_tables and table.lower() in self.exclude_tables[database.lower()]:
  logger.info('表被排除', {"库": database, "表": table, "操作": "跳过"})
@@ -639,10 +671,76 @@ class MySQLDeduplicator:
  if not self._check_table_exists(database, table):
  logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
  return (0, 0)
- logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
- result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup)
- logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
- # 自动重排id列(仅当有实际删除时且reorder_id为True)
+ logger.info('单表开始', {
+ "库": database,
+ "表": table,
+ # "参数": {
+ # "指定去重列": columns,
+ # "去重方式": "Python" if use_python_dedup else "SQL",
+ # "数据处理": self.duplicate_keep_mode,
+ # "模拟运行": dry_run,
+ # '排除列': self.exclude_columns,
+ # },
+ })
+ all_columns = self._get_table_columns(database, table)
+ all_columns_lower = [col.lower() for col in all_columns]
+ time_col = self.date_column
+ time_col_lower = time_col.lower() if time_col else None
+ has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
+ if has_time_col:
+ self._ensure_index(database, table, time_col)
+ all_dates = self._get_all_dates(database, table, time_col)
+ # 按date_range/recent_month筛选日期
+ start_date = self._dedup_start_date
+ end_date = self._dedup_end_date
+ if start_date and end_date:
+ all_dates = [d for d in all_dates if str(start_date) <= str(d) <= str(end_date)]
+ if not all_dates:
+ logger.info('无可处理日期', {"库": database, "表": table})
+ return (0, 0)
+ total_dup = 0
+ total_del = 0
+ def process_date(date_val):
+ try:
+ logger.debug('按天分区去重', {"库": database, "表": table, "日期": date_val})
+ dup_count, affected_rows = self._deduplicate_table(
+ database, table, columns, dry_run, use_python_dedup,
+ date_val=date_val, lock_table=False
+ )
+ return (dup_count, affected_rows, date_val, None)
+ except Exception as e:
+ logger.error('分区去重异常', {"库": database, "表": table, "日期": date_val, "异常": str(e), "func": sys._getframe().f_code.co_name})
+ return (0, 0, date_val, str(e))
+ if self.max_workers > 1:
+ with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+ future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
+ for future in concurrent.futures.as_completed(future_to_date):
+ dup_count, affected_rows, date_val, err = future.result()
+ if err:
+ logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+ total_dup += dup_count
+ total_del += affected_rows
+ else:
+ for date_val in all_dates:
+ dup_count, affected_rows, _, err = process_date(date_val)
+ if err:
+ logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+ total_dup += dup_count
+ total_del += affected_rows
+ logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
+ # 自动重排id列(仅当有实际删除时且reorder_id为True)
+ if reorder_id and total_del > 0:
+ try:
+ reorder_ok = self.reorder_id_column(database, table, id_column=self.primary_key, dry_run=dry_run)
+ logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
+ except Exception as e:
+ logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+ if affected_rows > 0:
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
+ return (total_dup, total_del)
+ # 没有date_column,直接全表去重
+ result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
+ logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
  dup_count, affected_rows = result
  if reorder_id and affected_rows > 0:
  try:
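`deduplicate_table` now owns the per-day fan-out: it filters `_get_all_dates` through the configured window, then runs one `_deduplicate_table(..., date_val=...)` task per date, in a thread pool when `max_workers > 1`. The dispatch pattern in isolation (`process_date` here is a stand-in for the closure defined in the hunk above):

    import concurrent.futures

    def process_date(date_val):
        # stand-in: returns (dup_count, deleted_rows, date, error)
        return (0, 0, date_val, None)

    all_dates = ["2025-06-09", "2025-06-10"]
    total_dup = total_del = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = {executor.submit(process_date, d): d for d in all_dates}
        for future in concurrent.futures.as_completed(futures):
            dup_count, deleted, date_val, err = future.result()
            total_dup += dup_count
            total_del += deleted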
@@ -650,34 +748,36 @@ class MySQLDeduplicator:
  logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
  except Exception as e:
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+ if affected_rows > 0:
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
  return result
  except Exception as e:
  logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
  return (0, 0)

  def deduplicate_database(
- self,
- database: str,
- tables: Optional[List[str]] = None,
- columns_map: Optional[Dict[str, List[str]]] = None,
- dry_run: bool = False,
- parallel: bool = False,
- reorder_id: bool = False,
- use_python_dedup: bool = True
+ self,
+ database: str,
+ tables: Optional[List[str]] = None,
+ columns_map: Optional[Dict[str, List[str]]] = None,
+ dry_run: bool = False,
+ parallel: bool = False,
+ reorder_id: bool = False,
+ use_python_dedup: bool = True
  ) -> Dict[str, Tuple[int, int]]:
  """
- 对指定数据库的所有表进行去重。
+ 对指定数据库的所有表进行去重。调用 deduplicate_table,自动适配分天。

  Args:
  database (str): 数据库名。
- tables (Optional[List[str]]): 要处理的表列表(为None时处理所有表)。
- columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
+ tables (Optional[List[str]]): 指定表名列表。
+ columns_map (Optional[Dict[str, List[str]]]): 每个表的去重列映射。
  dry_run (bool): 是否为模拟运行。
- parallel (bool): 是否并行处理。
- reorder_id (bool): 去重后是否重排id
- use_python_dedup (bool): 是否用Python查找重复id。
+ parallel (bool): 是否并行处理表。
+ reorder_id (bool): 去重后是否自动重排 id 列。
+ use_python_dedup (bool): 是否用 Python 方式去重。
  Returns:
- Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
+ Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
  """
  results = {}
  try:
@@ -693,8 +793,6 @@ class MySQLDeduplicator:
  return results
  logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
  if parallel and self.max_workers > 1:
- logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
- # 使用线程池并行处理
  with concurrent.futures.ThreadPoolExecutor(
  max_workers=self.max_workers
  ) as executor:
@@ -715,8 +813,6 @@ class MySQLDeduplicator:
  logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
  results[table] = (0, 0)
  else:
- logger.debug('串行处理表', {'库': database})
- # 串行处理
  for table in target_tables:
  columns = columns_map.get(table) if columns_map else None
  dup_count, affected_rows = self.deduplicate_table(
@@ -725,35 +821,39 @@ class MySQLDeduplicator:
  results[table] = (dup_count, affected_rows)
  total_dup = sum(r[0] for r in results.values())
  total_del = sum(r[1] for r in results.values())
- logger.info('单库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+ logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+ # 只显示有删除的详细结果
+ if total_del > 0:
+ filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
+ logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
  return results
  except Exception as e:
  logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
  return results

  def deduplicate_all(
- self,
- databases: Optional[List[str]] = None,
- tables_map: Optional[Dict[str, List[str]]] = None,
- columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
- dry_run: bool = False,
- parallel: bool = False,
- reorder_id: bool = False,
- use_python_dedup: bool = True
+ self,
+ databases: Optional[List[str]] = None,
+ tables_map: Optional[Dict[str, List[str]]] = None,
+ columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
+ dry_run: bool = False,
+ parallel: bool = False,
+ reorder_id: bool = False,
+ use_python_dedup: bool = True
  ) -> Dict[str, Dict[str, Tuple[int, int]]]:
  """
- 对所有数据库进行去重。
+ 对所有数据库进行去重。调用 deduplicate_database,自动适配分天。

  Args:
- databases (Optional[List[str]]): 要处理的数据库列表。如果为 None,则处理所有非系统数据库。
- tables_map (Optional[Dict[str, List[str]]]): 指定每个数据库要处理的表,格式为 {数据库名: [表名, ...]}。如果为 None,则处理所有表。
- columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
- dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
- parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
- reorder_id (bool): 去重后是否重排id
- use_python_dedup (bool): 是否用Python查找重复id。
+ databases (Optional[List[str]]): 指定数据库名列表。
+ tables_map (Optional[Dict[str, List[str]]]): 每个库的表名映射。
+ columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 每个库每个表的去重列映射。
+ dry_run (bool): 是否为模拟运行。
+ parallel (bool): 是否并行处理库。
+ reorder_id (bool): 去重后是否自动重排 id 列。
+ use_python_dedup (bool): 是否用 Python 方式去重。
  Returns:
- Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}
+ Dict[str, Dict[str, Tuple[int, int]]]: {库: {表: (重复组数, 实际删除行数)}}
  """
  all_results: Dict[str, Dict[str, Tuple[int, int]]] = defaultdict(dict)
  try:
@@ -763,9 +863,19 @@ class MySQLDeduplicator:
  if not target_dbs:
  logger.warning('没有可处理的数据库')
  return all_results
- logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
- if parallel and self.max_workers > 1:
- # 使用线程池并行处理多个数据库
+ logger.info('全局开始', {
+ "数据库数量": len(target_dbs),
+ "数据库列表": target_dbs,
+ "参数": {
+ "模拟运行": dry_run,
+ "并行处理": parallel,
+ '排除列': self.exclude_columns,
+ '重排id': reorder_id,
+ 'use_python_dedup': use_python_dedup
+ },
+ })
+ # 如果parallel=True且库数量大于1,则只在外层并发,内层串行
+ if parallel and self.max_workers > 1 and len(target_dbs) > 1:
  with concurrent.futures.ThreadPoolExecutor(
  max_workers=self.max_workers
  ) as executor:
@@ -773,6 +883,7 @@ class MySQLDeduplicator:
  for db in target_dbs:
  tables = tables_map.get(db) if tables_map else None
  db_columns_map = columns_map.get(db) if columns_map else None
+ # 内层强制串行
  futures[executor.submit(
  self.deduplicate_database,
  db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
@@ -786,7 +897,6 @@ class MySQLDeduplicator:
  logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
  all_results[db] = {}
  else:
- # 串行处理数据库
  for db in target_dbs:
  tables = tables_map.get(db) if tables_map else None
  db_columns_map = columns_map.get(db) if columns_map else None
@@ -802,13 +912,42 @@ class MySQLDeduplicator:
  r[1] for db in all_results.values()
  for r in db.values()
  )
- logger.info('全局完成', {"总重复组": total_dup, "总删除行": total_del, "详细结果": dict(all_results)})
+ logger.debug('全局完成', {
+ "总重复组": total_dup,
+ "总删除行": total_del,
+ "参数": {
+ "模拟运行": dry_run,
+ "并行处理": parallel,
+ '排除列': self.exclude_columns,
+ '重排id': reorder_id,
+ 'use_python_dedup': use_python_dedup
+ },
+ "详细结果": dict(all_results)
+ })
+ # 只显示有删除的详细结果
+ if total_del > 0:
+ filtered_results = {
+ db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
+ for db, tbls in all_results.items()
+ }
+ filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
+ logger.info('全局完成(仅显示有删除的结果)', {
+ "总重复组": total_dup,
+ "总删除行": total_del,
+ "参数": {
+ "模拟运行": dry_run,
+ "并行处理": parallel,
+ '排除列': self.exclude_columns,
+ '重排id': reorder_id,
+ 'use_python_dedup': use_python_dedup
+ },
+ "详细结果": filtered_results
+ })
  return all_results
  except Exception as e:
  logger.error('异常', {"error": str(e), 'traceback': repr(e)})
  return all_results

- @_retry_on_failure
  def _check_database_exists(self, database: str) -> bool:
  """
  检查数据库是否存在。
@@ -819,13 +958,11 @@ class MySQLDeduplicator:
  bool: 数据库是否存在。
  """
  sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql, (database,))
  return bool(cursor.fetchone())

- @_retry_on_failure
  def _check_table_exists(self, database: str, table: str) -> bool:
  """
  检查表是否存在。
@@ -841,12 +978,35 @@ class MySQLDeduplicator:
  FROM INFORMATION_SCHEMA.TABLES
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
  """
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql, (database, table))
  return bool(cursor.fetchone())

+ def _get_table_info(self, database: str, table: str, id_column: str = None):
+ """
+ 获取表的所有列名、主键列名列表、指定id列是否为主键。
+ Args:
+ database (str): 数据库名。
+ table (str): 表名。
+ id_column (str): id列名,默认使用self.primary_key。
+ Returns:
+ Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
+ """
+ id_column = id_column or self.primary_key
+ with self._conn_ctx() as conn:
+ with conn.cursor() as cursor:
+ cursor.execute("""
+ SELECT COLUMN_NAME, COLUMN_KEY
+ FROM INFORMATION_SCHEMA.COLUMNS
+ WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+ """, (database, table))
+ columns_info = cursor.fetchall()
+ columns = [row['COLUMN_NAME'] for row in columns_info]
+ pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+ id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+ return columns, pk_cols, id_is_pk
+
  def close(self) -> None:
  """
  关闭连接池。
@@ -895,15 +1055,16 @@ class MySQLDeduplicator:
  auto_drop_backup: bool = True
  ) -> Any:
  """
- 安全重排指定表或指定库下所有表的id列为顺序自增(1,2,3...)。
+ 安全重排指定表或指定库下所有表的 id 列为顺序自增(1,2,3...)。
+
  Args:
- database (str): 数据库名
- table (Optional[str]): 表名,None时批量处理该库所有表
- id_column (str): id列名,默认"id"
- dry_run (bool): 是否为模拟运行
- auto_drop_backup (bool): 校验通过后自动删除备份表
+ database (str): 数据库名。
+ table (Optional[str]): 表名,None 时批量处理该库所有表。
+ id_column (str): id 列名,默认 "id"
+ dry_run (bool): 是否为模拟运行。
+ auto_drop_backup (bool): 校验通过后自动删除备份表。
  Returns:
- bool 或 dict: 单表时bool,批量时{表名: bool}
+ bool 或 dict: 单表时 bool,批量时 {表名: bool}
  """
  if not table:
  # 批量模式,对库下所有表执行
@@ -942,7 +1103,7 @@ class MySQLDeduplicator:
  logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
  return False
  # 检查外键约束
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute("""
  SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
@@ -952,7 +1113,7 @@ class MySQLDeduplicator:
  logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
  return False
  # 获取表结构
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
  create_table_sql = cursor.fetchone()['Create Table']
@@ -965,7 +1126,7 @@ class MySQLDeduplicator:
  backup_table = self._make_backup_table_name(table)
  backup_table_quoted = f"`{database}`.`{backup_table}`"
  try:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  # 1. 创建临时表,结构同原表
  try:
@@ -1026,7 +1187,7 @@ class MySQLDeduplicator:
  logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
  return False
  logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
- # 5. 可选:自动删除备份表
+ # 5. 自动删除备份表
  if auto_drop_backup:
  try:
  cursor.execute(f"DROP TABLE {backup_table_quoted}")
@@ -1037,7 +1198,7 @@ class MySQLDeduplicator:
  except Exception as e:
  logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
  # 回滚:如临时表存在则删掉,恢复原表结构
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  try:
  cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
@@ -1045,7 +1206,7 @@ class MySQLDeduplicator:
  logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
  # 恢复原表(如备份表存在)
  try:
- with self._get_connection() as conn2:
+ with self._conn_ctx() as conn2:
  with conn2.cursor() as cursor2:
  if self._check_table_exists(database, backup_table):
  cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
@@ -1057,6 +1218,41 @@ class MySQLDeduplicator:
  finally:
  self._release_table_lock(database, table)

+ def _acquire_table_lock(self, database: str, table: str, timeout: int = 60) -> bool:
+ """
+ 获取表级锁,防止多线程/多进程并发操作同一张表。
+ Args:
+ database (str): 数据库名。
+ table (str): 表名。
+ timeout (int): 等待锁的超时时间(秒)。
+ Returns:
+ bool: 是否成功获取锁。
+ """
+ key = f"{database.lower()}::{table.lower()}"
+ start_time = time.time()
+ while True:
+ with self._lock:
+ if key not in self._processing_tables:
+ self._processing_tables.add(key)
+ return True
+ if time.time() - start_time > timeout:
+ logger.warning('获取表级锁超时', {"库": database, "表": table, "timeout": timeout})
+ return False
+ time.sleep(0.2)
+
+ def _release_table_lock(self, database: str, table: str) -> None:
+ """
+ 释放表级锁。
+ Args:
+ database (str): 数据库名。
+ table (str): 表名。
+ Returns:
+ None
+ """
+ key = f"{database.lower()}::{table.lower()}"
+ with self._lock:
+ self._processing_tables.discard(key)
+
  @staticmethod
  def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
  """
@@ -1077,30 +1273,6 @@ class MySQLDeduplicator:
  return (prefix + suffix)[:max_length]
  return f"{prefix}{base[:remain]}{suffix}"[:max_length]

- def _get_table_info(self, database: str, table: str, id_column: str = None):
- """
- 获取表的所有列名、主键列名列表、指定id列是否为主键。
- Args:
- database (str): 数据库名。
- table (str): 表名。
- id_column (str): id列名,默认使用self.primary_key。
- Returns:
- Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
- """
- id_column = id_column or self.primary_key
- with self._get_connection() as conn:
- with conn.cursor() as cursor:
- cursor.execute("""
- SELECT COLUMN_NAME, COLUMN_KEY
- FROM INFORMATION_SCHEMA.COLUMNS
- WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
- """, (database, table))
- columns_info = cursor.fetchall()
- columns = [row['COLUMN_NAME'] for row in columns_info]
- pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
- id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
- return columns, pk_cols, id_is_pk
-
  def _make_temp_table_name(self, base: str) -> str:
  """
  生成临时表名,带有 temp_ 前缀和 _dedup_ 进程线程后缀。
@@ -1122,26 +1294,35 @@ def main():
  password='pwd',
  host='localhost',
  port=3306,
- # date_range=['2025-05-27', '2025-05-28'],
- exclude_tables={'推广数据2': [
- # '地域报表_城市_2025_04',
- # '地域报表_城市_2025_05',
- # '地域报表_城市_2025_06',
- '地域报表_城市_2025_04_copy1',
- '地域报表_城市_2025_05_copy1',
- '地域报表_城市_2025_06_copy1',
- '主体报表_2025_copy1'
- ]}
+ max_workers= 2,
+ batch_size=1000,
+ skip_system_dbs=True,
+ max_retries=3,
+ retry_waiting_time=5,
+ # pool_size=30,
+ recent_month=1,
+ # date_range=['2025-06-09', '2025-06-10'],
+ date_column='日期',
+ exclude_databases=['测试库4'],
+ exclude_tables={
+ '推广数据2': [
+ '地域报表_城市_2025_04',
+ # '地域报表_城市_2025_04_copy1',
+ ],
+ "生意参谋3": [
+ "商品排行_2025",
+ ],
+ },
  )

  # 全库去重(单线程)
- deduplicator.deduplicate_all(dry_run=True, parallel=True, reorder_id=True)
+ deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)

  # # 指定数据库去重(多线程)
- # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False, reorder_id=True)
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)

  # # 指定表去重(使用特定列)
- # deduplicator.deduplicate_table('my_db', 'my_table', columns=["name", "date"], dry_run=False, reorder_id=False)
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)

  # # 重排id列
  # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
{mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mdbq
- Version: 3.11.10
+ Version: 3.12.0
  Home-page: https://pypi.org/project/mdbq
  Author: xigua,
  Author-email: 2587125111@qq.com
{mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/RECORD RENAMED
@@ -1,5 +1,5 @@
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
- mdbq/__version__.py,sha256=L9HK2W1LgO8Zc5gpJgI1uJ5J0VRcUyMXHr1ZT-FeNOM,19
+ mdbq/__version__.py,sha256=W8WVhYkHLU0SBDlL9Q6XQVTqIrzYjc1kFBZgqzS_NEI,18
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
  mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
- mdbq/mysql/deduplicator.py,sha256=w8etA5dAsY7g58bWU3SQt7n_OWnS9Y2TVh0D7m0MK9E,57961
+ mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
  mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
- mdbq-3.11.10.dist-info/METADATA,sha256=dVhkC84iq1GWtV6onfsLj18CwfGnIo1bXXDa-TXUU1E,365
- mdbq-3.11.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mdbq-3.11.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
- mdbq-3.11.10.dist-info/RECORD,,
+ mdbq-3.12.0.dist-info/METADATA,sha256=Q6EyaC61H4okFva6YFV2a0Y3Iqun8L8mnpSkeVXcFdc,364
+ mdbq-3.12.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mdbq-3.12.0.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+ mdbq-3.12.0.dist-info/RECORD,,