mdbq-3.11.9-py3-none-any.whl → mdbq-3.11.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
- VERSION = '3.11.9'
+ VERSION = '3.11.11'

mdbq/mysql/deduplicator.py CHANGED
@@ -34,32 +34,6 @@ logger = mylogger.MyLogger(
  class MySQLDeduplicator:
      """
      MySQL data deduplication
-
-     Features:
-     1. Automatically detects and removes duplicate rows in MySQL databases
-     2. Supports scanning every database or processing specified tables
-     3. Safe multi-thread/multi-process handling
-     4. Robust error handling and logging
-
-     Usage example:
-     deduplicator = MySQLDeduplicator(
-         username='root',
-         password='password',
-         host='localhost',
-         port=3306
-     )
-
-     # Deduplicate all databases
-     deduplicator.deduplicate_all()
-
-     # Deduplicate a given database (multi-threaded)
-     deduplicator.deduplicate_database('my_db', parallel=True)
-
-     # Deduplicate a given table (using specific columns)
-     deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
-
-     # Close the connection
-     deduplicator.close()
      """
 
      def __init__(
@@ -69,7 +43,7 @@ class MySQLDeduplicator:
          host: str = 'localhost',
          port: int = 3306,
          charset: str = 'utf8mb4',
-         max_workers: int = 1,
+         max_workers: int = 2,
          batch_size: int = 1000,
          skip_system_dbs: bool = True,
          max_retries: int = 3,
@@ -121,27 +95,17 @@ class MySQLDeduplicator:
          self.retry_interval = retry_interval
          self.primary_key = primary_key
 
-         # Time-range parameters
-         self.date_range = date_range
-         self.recent_month = recent_month
+         # Time-range parameters (keep only the parsed results; drop the redundant raw arguments)
          self.date_column = date_column
          self._dedup_start_date = None
          self._dedup_end_date = None
-         # 'id' is always excluded, whether or not exclude_columns is passed
-         default_exclude = {'id'}
-         # When exclude_columns is not passed, exclude: ['id', '更新时间']
-         if not exclude_columns:
-             self.exclude_columns = list(default_exclude | {'更新时间'})
-         else:
-             self.exclude_columns = list(set(exclude_columns) | default_exclude)
-         # Parse the time range and auto-correct date_range
-         if self.date_range and len(self.date_range) == 2:
+         if date_range and len(date_range) == 2:
              try:
-                 start, end = self.date_range
+                 start, end = date_range
                  start_dt = datetime.strptime(start, "%Y-%m-%d")
                  end_dt = datetime.strptime(end, "%Y-%m-%d")
                  if start_dt > end_dt:
-                     logger.warning(
+                     logger.debug(
                          "date_range is out of order; the start and end dates were swapped automatically.",
                          {"start": start, "end": end}
                      )
@@ -151,13 +115,13 @@ class MySQLDeduplicator:
              except Exception as e:
                  logger.error(
                      "Invalid date_range format; expected ['YYYY-MM-DD', 'YYYY-MM-DD']. The time range was ignored.",
-                     {"date_range": self.date_range, "error": str(e)}
+                     {"date_range": date_range, "error": str(e)}
                  )
                  self._dedup_start_date = None
                  self._dedup_end_date = None
-         elif self.recent_month:
+         elif recent_month:
              today = datetime.today()
-             month = today.month - self.recent_month
+             month = today.month - recent_month
              year = today.year
              while month <= 0:
                  month += 12
@@ -165,16 +129,19 @@ class MySQLDeduplicator:
              self._dedup_start_date = f"{year}-{month:02d}-01"
              self._dedup_end_date = today.strftime("%Y-%m-%d")
 
+         # Excluded columns: merge and deduplicate in one step
+         self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
+
          # Thread-safety controls
          self._lock = threading.Lock()
          self._processing_tables = set()  # tables currently being processed
 
          # System databases
-         self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
+         self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys', 'sakila'}
 
          # Excluded databases and tables
-         self.exclude_databases = set([db.lower() for db in exclude_databases]) if exclude_databases else set()
-         self.exclude_tables = {k.lower(): set([t.lower() for t in v]) for k, v in (exclude_tables or {}).items()}
+         self.exclude_databases = set(db.lower() for db in (exclude_databases or []))
+         self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}
 
          self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
 
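
The recent_month branch above turns "N months back" into a concrete start date by borrowing from the year whenever the month counter drops to zero or below, and the exclude-columns merge always folds in 'id' and '更新时间'. A standalone sketch of the same arithmetic (a hypothetical mirror for illustration, not code imported from the package):

    from datetime import datetime
    from typing import Tuple

    def recent_month_window(recent_month: int, today: datetime) -> Tuple[str, str]:
        # Walk back `recent_month` months; borrow a year while the month
        # counter is zero or negative, exactly like the loop in __init__.
        month = today.month - recent_month
        year = today.year
        while month <= 0:
            month += 12
            year -= 1
        return f"{year}-{month:02d}-01", today.strftime("%Y-%m-%d")

    # recent_month=1 on 2025-06-10 yields the window ('2025-05-01', '2025-06-10')
    assert recent_month_window(1, datetime(2025, 6, 10)) == ("2025-05-01", "2025-06-10")
    # recent_month=6 in January correctly lands in the previous year
    assert recent_month_window(6, datetime(2025, 1, 15))[0] == "2024-07-01"
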
@@ -215,7 +182,7 @@ class MySQLDeduplicator:
              last_exception = None
              for attempt in range(self.max_retries + 1):
                  try:
-                     logger.debug(f'Calling {func.__name__}, attempt {attempt+1}', {'args': args, 'kwargs': kwargs})
+                     logger.debug(f'Calling {func.__name__}, connection attempt {attempt+1}', {'args': args, 'kwargs': kwargs})
                      return func(self, *args, **kwargs)
                  except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                      last_exception = e
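
This decorator retries only connection-level pymysql errors (OperationalError, InterfaceError); logic errors propagate immediately. A minimal free-standing sketch of the pattern, with the logging stripped and names assumed rather than taken from the module:

    import functools
    import time

    import pymysql

    def retry_on_failure(max_retries: int = 3, retry_interval: int = 5):
        """Retry transient MySQL connection failures; re-raise anything else."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                last_exception = None
                for attempt in range(max_retries + 1):
                    try:
                        return func(*args, **kwargs)
                    except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                        last_exception = e  # connection dropped: wait, then retry
                        if attempt < max_retries:
                            time.sleep(retry_interval)
                raise last_exception
            return wrapper
        return decorator
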
@@ -239,7 +206,7 @@ class MySQLDeduplicator:
      @_retry_on_failure
      def _get_databases(self) -> List[str]:
          """
-         Get all non-system databases, excluding exclude_databases.
+         Get all non-system databases, excluding exclude_databases.
 
          Returns:
              List[str]: database names.
@@ -256,7 +223,7 @@ class MySQLDeduplicator:
      @_retry_on_failure
      def _get_tables(self, database: str) -> List[str]:
          """
-         Get all table names in the given database.
+         Get all table names in the given database (excluding temporary tables with the temp_ prefix).
 
          Args:
              database (str): database name.
@@ -296,43 +263,11 @@ class MySQLDeduplicator:
              return [row['COLUMN_NAME'] for row in cursor.fetchall()
                      if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
 
-     def _acquire_table_lock(self, database: str, table: str) -> bool:
-         """
-         Acquire the table-processing lock to prevent concurrent work on the same table.
-
-         Args:
-             database (str): database name.
-             table (str): table name.
-         Returns:
-             bool: whether the lock was acquired.
-         """
-         key = f"{database}.{table}"
-
-         with self._lock:
-             if key in self._processing_tables:
-                 logger.debug(f"Table {key} is being processed by another thread, skipping")
-                 return False
-             self._processing_tables.add(key)
-             return True
-
-     def _release_table_lock(self, database: str, table: str) -> None:
-         """
-         Release the table-processing lock.
-
-         Args:
-             database (str): database name.
-             table (str): table name.
-         """
-         key = f"{database}.{table}"
-
-         with self._lock:
-             if key in self._processing_tables:
-                 self._processing_tables.remove(key)
-
      @_retry_on_failure
      def _ensure_index(self, database: str, table: str, date_column: str) -> None:
          """
-         Check date_column and create an index automatically if one does not exist.
+         Check date_column and create an index automatically if one does not exist.
+
          Args:
              database (str): database name.
              table (str): table name.
@@ -356,121 +291,241 @@ class MySQLDeduplicator:
                  try:
                      cursor.execute(f"CREATE INDEX `{safe_index_name}` ON `{database}`.`{table}` (`{date_column}`)")
                      conn.commit()
-                     logger.info('Automatically created an index on date_column', {"db": database, "table": table, "date_column": date_column, "index_name": safe_index_name})
+                     logger.debug('Automatically created an index on date_column', {"db": database, "table": table, "date_column": date_column, "index_name": safe_index_name})
                  except Exception as e:
                      logger.error('Failed to create an index on date_column automatically', {"db": database, "table": table, "date_column": date_column, "error": str(e)})
-             else:
-                 logger.debug('date_column already has an index', {"db": database, "table": table, "date_column": date_column})
+
+     @_retry_on_failure
+     def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
+         """
+         Get all distinct date partitions (by day) in the table.
+
+         Args:
+             database (str): database name.
+             table (str): table name.
+             date_column (str): date column name.
+         Returns:
+             List[str]: all distinct dates (as strings).
+         """
+         sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql)
+                 return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
 
      def _deduplicate_table(
-             self,
-             database: str,
-             table: str,
-             columns: Optional[List[str]] = None,
-             dry_run: bool = False,
-             use_python_dedup: bool = False
+             self,
+             database: str,
+             table: str,
+             columns: Optional[List[str]] = None,
+             dry_run: bool = False,
+             use_python_dedup: bool = False,
+             date_val: Optional[str] = None,
+             lock_table: bool = True
      ) -> Tuple[int, int]:
          """
-         Deduplicate a single table.
-         Supports day-by-day batching (when the table has date_column); otherwise deduplicates the whole table.
-         If date_column is in exclude_columns, the table is skipped outright.
-         Optimizations: delete in batches by primary key, avoid re-creating/dropping the temp table, process each day concurrently.
+         Deduplicate a single table for a single day. Processes only the data of date_val (when there is a date_column); otherwise the whole table.
+
+         Args:
+             database (str): database name.
+             table (str): table name.
+             columns (Optional[List[str]]): columns to deduplicate on.
+             dry_run (bool): whether this is a dry run.
+             use_python_dedup (bool): whether to deduplicate in Python.
+             date_val (Optional[str]): the date to process (when there is a date_column).
+             lock_table (bool): whether to take the table-level lock.
+         Returns:
+             Tuple[int, int]: (duplicate group count, rows actually deleted)
          """
-         if not self._acquire_table_lock(database, table):
+         if lock_table and not self._acquire_table_lock(database, table):
              return (0, 0)
          temp_table = None
          try:
-             # Fetch the actual column names
              all_columns = self._get_table_columns(database, table)
              all_columns_lower = [col.lower() for col in all_columns]
              exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
              time_col = self.date_column
              time_col_lower = time_col.lower() if time_col else None
-             # 1. Skip the table when date_column is in exclude_columns
              if time_col_lower and time_col_lower in exclude_columns_lower:
                  logger.warning('date_column is in exclude_columns, skipping table', {"db": database, "table": table, "date_column": time_col, "exclude_columns": self.exclude_columns})
                  return (0, 0)
-             # 2. Check whether the table has date_column
              has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
-             # When the table has date_column, check and create the index automatically
-             if has_time_col:
+
+             # Whenever there is a date_column, always work day by day (this function handles one day)
+             if has_time_col and date_val is not None:
                  self._ensure_index(database, table, time_col)
-             # 3. Determine the deduplication columns
+                 # Determine the deduplication columns
+                 use_columns = columns or all_columns
+                 use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
+                 invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
+                 if invalid_columns:
+                     logger.warning('Nonexistent columns', {"db": database, "table": table, "missing_columns": invalid_columns, 'func': sys._getframe().f_code.co_name})
+                 if not use_columns:
+                     logger.error('No valid deduplication columns', {"db": database, "table": table, "func": sys._getframe().f_code.co_name})
+                     return (0, 0)
+                 pk = self.primary_key
+                 pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
+                 where_sql = f"t.`{time_col}` = '{date_val}'"
+                 # Count the source rows (the given day only)
+                 with self._get_connection() as conn:
+                     with conn.cursor() as cursor:
+                         count_where = f"WHERE `{time_col}` = '{date_val}'"
+                         count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                         logger.debug('Executing SQL', {'sql': count_sql})
+                         cursor.execute(count_sql)
+                         total_count_row = cursor.fetchone()
+                         total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+                 logger.debug('Processing', {"db": database, "table": table, "rows_to_process": total_count, 'func': sys._getframe().f_code.co_name, "date": date_val})
+                 column_list = ', '.join([f'`{col}`' for col in use_columns])
+
+                 # Find duplicates in Python
+                 if use_python_dedup:
+                     select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
+                     select_where = f"WHERE `{time_col}` = '{date_val}'"
+                     grouped = defaultdict(list)
+                     for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
+                         key = tuple(row[col] for col in use_columns)
+                         grouped[key].append(row[pk_real])
+                     dup_count = 0
+                     del_ids = []
+                     for ids in grouped.values():
+                         if len(ids) > 1:
+                             dup_count += 1
+                             del_ids.extend(ids[1:])
+                     affected_rows = 0
+                     if not dry_run and del_ids:
+                         with self._get_connection() as conn:
+                             with conn.cursor() as cursor:
+                                 for i in range(0, len(del_ids), self.batch_size):
+                                     batch_ids = del_ids[i:i+self.batch_size]
+                                     del_ids_str = ','.join([str(i) for i in batch_ids])
+                                     delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                     cursor.execute(delete_sql)
+                                     batch_deleted = cursor.rowcount
+                                     affected_rows += batch_deleted
+                                 conn.commit()
+                     logger.debug('Dedup finished', {"db": database, "table": table, "rows": total_count, "duplicate_groups": dup_count, "deleted_rows": affected_rows, "dedup_method": "Python", "keep_mode": self.duplicate_keep_mode, "date": date_val})
+                     return (dup_count, affected_rows)
+                 # Find duplicates in SQL
+                 temp_table = self._make_temp_table_name(table)
+                 drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+                 create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
+                 create_temp_sql = f"""
+                     CREATE TABLE `{database}`.`{temp_table}` AS
+                     SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+                     FROM `{database}`.`{table}`
+                     {create_temp_where}
+                     GROUP BY {column_list}
+                     HAVING COUNT(*) > 1
+                 """
+                 with self._get_connection() as conn:
+                     with conn.cursor() as cursor:
+                         logger.debug('Create temp table SQL', {'sql': create_temp_sql})
+                         cursor.execute(create_temp_sql)
+                         cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
+                         dup_count_row = cursor.fetchone()
+                         dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
+                         if dup_count == 0:
+                             logger.debug('No duplicate data', {"db": database, "table": table, "rows": total_count, "date": date_val})
+                             cursor.execute(drop_temp_sql)
+                             conn.commit()
+                             return (0, 0)
+                         affected_rows = 0
+                         if not dry_run:
+                             while True:
+                                 where_clauses = []
+                                 if self.duplicate_keep_mode == 'keep_one':
+                                     where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
+                                 if where_sql.strip():
+                                     where_clauses.append(where_sql.strip())
+                                 where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+                                 find_dup_ids_sql = f"""
+                                     SELECT t.`{pk_real}` as del_id
+                                     FROM `{database}`.`{table}` t
+                                     JOIN `{database}`.`{temp_table}` tmp
+                                     ON {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+                                     {where_full}
+                                     LIMIT {self.batch_size}
+                                 """
+                                 logger.debug('SQL to find duplicate ids to delete', {'sql': find_dup_ids_sql})
+                                 cursor.execute(find_dup_ids_sql)
+                                 del_ids = [row['del_id'] for row in cursor.fetchall()]
+                                 if not del_ids:
+                                     break
+                                 del_ids_str = ','.join([str(i) for i in del_ids])
+                                 delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                 logger.debug('Batch delete by id SQL', {'sql': delete_sql, 'ids': del_ids})
+                                 cursor.execute(delete_sql)
+                                 batch_deleted = cursor.rowcount
+                                 affected_rows += batch_deleted
+                                 conn.commit()
+                                 if batch_deleted == 0:
+                                     logger.warning('No rows deleted in this batch; breaking out to avoid stalling', {"db": database, "table": table})
+                                     break
+                                 if batch_deleted < self.batch_size:
+                                     break
+                             logger.info('Delete operation', {"db": database, "table": table, "rows": total_count, "duplicate_groups": dup_count, "deleted_rows": affected_rows, "dedup_method": "SQL", "keep_mode": self.duplicate_keep_mode, "date": date_val})
+                         else:
+                             logger.debug('dry_run mode, not deleting', {"db": database, "table": table, "duplicate_groups": dup_count})
+                             affected_rows = 0
+                         cursor.execute(drop_temp_sql)
+                         conn.commit()
+                         return (dup_count, affected_rows)
+             # No date_column: process the whole table
+             # ...existing code for full-table deduplication (as before, but without recursion)...
              use_columns = columns or all_columns
              use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
              invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
              if invalid_columns:
                  logger.warning('Nonexistent columns', {"db": database, "table": table, "missing_columns": invalid_columns, 'func': sys._getframe().f_code.co_name})
              if not use_columns:
-                 logger.error('No valid deduplication columns', {"db": database, "table": table})
+                 logger.error('No valid deduplication columns', {"db": database, "table": table, "func": sys._getframe().f_code.co_name})
                  return (0, 0)
              pk = self.primary_key
              pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
-             # Decide whether a date-range condition is needed
-             where_sql = ''
-             if has_time_col and self._dedup_start_date and self._dedup_end_date:
-                 where_sql = f"t.`{time_col}` >= '{self._dedup_start_date}' AND t.`{time_col}` <= '{self._dedup_end_date}'"
-             # Count the source rows (within the range only)
+             # Count the source rows
              with self._get_connection() as conn:
                  with conn.cursor() as cursor:
-                     count_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
-                     count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                     count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
                      logger.debug('Executing SQL', {'sql': count_sql})
                      cursor.execute(count_sql)
                      total_count_row = cursor.fetchone()
                      total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
-             logger.info('Processing', {"db": database, "table": table, "rows_to_process": total_count, 'func': sys._getframe().f_code.co_name})
+             logger.debug('Processing', {"db": database, "table": table, "rows_to_process": total_count, 'func': sys._getframe().f_code.co_name})
              column_list = ', '.join([f'`{col}`' for col in use_columns])
-
-             # Find duplicates in Python
              if use_python_dedup:
-                 from collections import defaultdict
-                 # 1. Fetch all the data
                  select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
-                 select_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
-                 select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
-                 logger.debug('Finding duplicates in Python, fetch SQL', {'sql': select_sql})
-                 with self._get_connection() as conn:
-                     with conn.cursor() as cursor:
-                         cursor.execute(select_sql)
-                         rows = cursor.fetchall()
-                 # 2. Group to find duplicates
+                 select_where = ''
                  grouped = defaultdict(list)
-                 for row in rows:
+                 for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
                      key = tuple(row[col] for col in use_columns)
                      grouped[key].append(row[pk_real])
-                 # 3. Tally duplicate groups and the ids to delete
                  dup_count = 0
                  del_ids = []
                  for ids in grouped.values():
                      if len(ids) > 1:
                          dup_count += 1
-                         del_ids.extend(ids[1:])  # keep only the first one
+                         del_ids.extend(ids[1:])
                  affected_rows = 0
                  if not dry_run and del_ids:
                      with self._get_connection() as conn:
                          with conn.cursor() as cursor:
                              for i in range(0, len(del_ids), self.batch_size):
-                                 batch = del_ids[i:i+self.batch_size]
-                                 del_ids_str = ','.join([str(i) for i in batch])
+                                 batch_ids = del_ids[i:i+self.batch_size]
+                                 del_ids_str = ','.join([str(i) for i in batch_ids])
                                  delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
-                                 logger.debug('Python batch delete SQL', {'sql': delete_sql, 'ids': batch})
                                  cursor.execute(delete_sql)
                                  batch_deleted = cursor.rowcount
                                  affected_rows += batch_deleted
                              conn.commit()
-                 logger.info('Python dedup finished', {"db": database, "table": table, "rows": total_count, "duplicate_group_count": dup_count, "deleted_rows": affected_rows, "dedup_mode": self.duplicate_keep_mode, "dedup_columns": use_columns})
+                 logger.debug('Dedup finished', {"db": database, "table": table, "rows": total_count, "duplicate_groups": dup_count, "deleted_rows": affected_rows, "dedup_method": "Python", "keep_mode": self.duplicate_keep_mode})
                  return (dup_count, affected_rows)
-             # Find duplicates in SQL
-             temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
+             temp_table = self._make_temp_table_name(table)
              drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
-             # Add the where condition when creating the temp table
-             create_temp_where = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'" if has_time_col and self._dedup_start_date and self._dedup_end_date else ''
              create_temp_sql = f"""
                  CREATE TABLE `{database}`.`{temp_table}` AS
                  SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
                  FROM `{database}`.`{table}`
-                 {create_temp_where}
                  GROUP BY {column_list}
                  HAVING COUNT(*) > 1
              """
@@ -482,7 +537,7 @@ class MySQLDeduplicator:
                      dup_count_row = cursor.fetchone()
                      dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                      if dup_count == 0:
-                         logger.info('No duplicate data', {"db": database, "table": table, "rows": total_count, "dedup_columns": use_columns})
+                         logger.info('No duplicate data', {"db": database, "table": table, "rows": total_count})
                          cursor.execute(drop_temp_sql)
                          conn.commit()
                          return (0, 0)
@@ -492,8 +547,6 @@ class MySQLDeduplicator:
                              where_clauses = []
                              if self.duplicate_keep_mode == 'keep_one':
                                  where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
-                             if where_sql.strip():
-                                 where_clauses.append(where_sql.strip())
                              where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
                              find_dup_ids_sql = f"""
                                  SELECT t.`{pk_real}` as del_id
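
The SQL path first materializes each duplicate group's MIN(id) into a temp table, then repeatedly joins back with NULL-safe equality (<=>, so groups containing NULLs still match) and deletes at most batch_size ids per round. A sketch of how the two statements get assembled, using hypothetical names:

    database, table, temp_table = "db", "t", "temp_t_dedup_1234_5678"  # placeholders
    pk_real, use_columns, batch_size = "id", ["name", "日期"], 1000
    column_list = ", ".join(f"`{c}`" for c in use_columns)

    create_temp_sql = f"""
    CREATE TABLE `{database}`.`{temp_table}` AS
    SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
    FROM `{database}`.`{table}`
    GROUP BY {column_list}
    HAVING COUNT(*) > 1
    """

    # NULL-safe join back to the source table; keep_one spares each group's min_id.
    join_on = " AND ".join(f"t.`{c}` <=> tmp.`{c}`" for c in use_columns)
    find_dup_ids_sql = f"""
    SELECT t.`{pk_real}` as del_id
    FROM `{database}`.`{table}` t
    JOIN `{database}`.`{temp_table}` tmp ON {join_on}
    WHERE t.`{pk_real}` <> tmp.`min_id`
    LIMIT {batch_size}
    """
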
@@ -520,16 +573,15 @@ class MySQLDeduplicator:
                                  break
                              if batch_deleted < self.batch_size:
                                  break
-                         logger.info('Delete operation', {"db": database, "table": table, "rows": total_count, "duplicate_group_count": dup_count, "deleted_rows": affected_rows, "dedup_mode": self.duplicate_keep_mode, "dedup_columns": use_columns})
+                         logger.info('Delete operation', {"db": database, "table": table, "rows": total_count, "duplicate_groups": dup_count, "deleted_rows": affected_rows, "dedup_method": "SQL", "keep_mode": self.duplicate_keep_mode})
                      else:
-                         logger.debug('dry_run mode, not deleting', {"db": database, "table": table, "duplicate_group_count": dup_count})
+                         logger.debug('dry_run mode, not deleting', {"db": database, "table": table, "duplicate_groups": dup_count})
                          affected_rows = 0
                      cursor.execute(drop_temp_sql)
                      conn.commit()
                      return (dup_count, affected_rows)
          except Exception as e:
              logger.error('Exception', {"db": database, "table": table, "error": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
-             # Clean up the temp table on exceptions too
              if temp_table:
                  try:
                      with self._get_connection() as conn:
@@ -541,29 +593,30 @@ class MySQLDeduplicator:
                          logger.error('Failed to clean up the temp table after an exception', {"db": database, "table": table, "error": str(drop_e)})
              return (0, 0)
          finally:
-             self._release_table_lock(database, table)
+             if lock_table:
+                 self._release_table_lock(database, table)
 
      def deduplicate_table(
-             self,
-             database: str,
-             table: str,
-             columns: Optional[List[str]] = None,
-             dry_run: bool = False,
-             reorder_id: bool = False,
-             use_python_dedup: bool = True
+             self,
+             database: str,
+             table: str,
+             columns: Optional[List[str]] = None,
+             dry_run: bool = False,
+             reorder_id: bool = False,
+             use_python_dedup: bool = True
      ) -> Tuple[int, int]:
          """
-         Deduplicate the given table.
+         Deduplicate the given table. Always partitions by day (when there is a date_column); otherwise processes the whole table.
 
          Args:
              database (str): database name.
              table (str): table name.
-             columns (Optional[List[str]]): column names to deduplicate on (None means all columns).
-             dry_run (bool): whether this is a dry run (count only, no deletion).
-             reorder_id (bool): whether to reorder the id column after deduplication
-             use_python_dedup (bool): whether to find duplicate ids in Python.
+             columns (Optional[List[str]]): columns to deduplicate on.
+             dry_run (bool): whether this is a dry run.
+             reorder_id (bool): whether to automatically reorder the id column after deduplication.
+             use_python_dedup (bool): whether to deduplicate in Python.
          Returns:
-             Tuple[int, int]: (duplicate group count, rows actually deleted)
+             Tuple[int, int]: (duplicate group count, rows actually deleted)
          """
          if database.lower() in self.exclude_tables and table.lower() in self.exclude_tables[database.lower()]:
              logger.info('Table excluded', {"db": database, "table": table, "action": "skip"})
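
A minimal call sketch for the public signature documented above ('my_db', 'my_table' and the column names are placeholders, not tables from the package):

    dedup = MySQLDeduplicator(username='root', password='pwd', host='localhost', port=3306)
    dup_groups, deleted = dedup.deduplicate_table(
        'my_db', 'my_table',
        columns=['name', '日期'],  # None: every non-excluded column
        dry_run=True,              # count duplicates without deleting
        reorder_id=False,
        use_python_dedup=True,
    )
    dedup.close()
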
@@ -572,10 +625,73 @@ class MySQLDeduplicator:
          if not self._check_table_exists(database, table):
              logger.warning('Table does not exist', {"db": database, "table": table, "warning": "skip"})
              return (0, 0)
-         logger.info('Single-table dedup started', {"db": database, "table": table, "params": {"columns": columns, "dry_run": dry_run, 'exclude_columns': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
-         result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup)
-         logger.info('Single-table dedup finished', {"db": database, "table": table, "result [dups, deleted]": result})
-         # Reorder the id column automatically (only when rows were actually deleted and reorder_id is True)
+         logger.info('Single-table dedup started', {
+             "db": database,
+             "table": table,
+             "params": {
+                 "columns": columns,
+                 "dedup_method": "Python" if use_python_dedup else "SQL",
+                 "keep_mode": self.duplicate_keep_mode,
+                 "dry_run": dry_run,
+                 'exclude_columns': self.exclude_columns,
+             }})
+         all_columns = self._get_table_columns(database, table)
+         all_columns_lower = [col.lower() for col in all_columns]
+         time_col = self.date_column
+         time_col_lower = time_col.lower() if time_col else None
+         has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
+         if has_time_col:
+             self._ensure_index(database, table, time_col)
+             all_dates = self._get_all_dates(database, table, time_col)
+             # Filter the dates by date_range/recent_month
+             start_date = self._dedup_start_date
+             end_date = self._dedup_end_date
+             if start_date and end_date:
+                 all_dates = [d for d in all_dates if str(start_date) <= str(d) <= str(end_date)]
+             if not all_dates:
+                 logger.info('No dates to process', {"db": database, "table": table})
+                 return (0, 0)
+             total_dup = 0
+             total_del = 0
+             def process_date(date_val):
+                 try:
+                     logger.debug('Deduplicating day partition', {"db": database, "table": table, "date": date_val})
+                     dup_count, affected_rows = self._deduplicate_table(
+                         database, table, columns, dry_run, use_python_dedup,
+                         date_val=date_val, lock_table=False
+                     )
+                     return (dup_count, affected_rows, date_val, None)
+                 except Exception as e:
+                     logger.error('Partition dedup exception', {"db": database, "table": table, "date": date_val, "error": str(e), "func": sys._getframe().f_code.co_name})
+                     return (0, 0, date_val, str(e))
+             if self.max_workers > 1:
+                 with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                     future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
+                     for future in concurrent.futures.as_completed(future_to_date):
+                         dup_count, affected_rows, date_val, err = future.result()
+                         if err:
+                             logger.warning('Partition processing failed', {"db": database, "table": table, "date": date_val, "error": err, "func": sys._getframe().f_code.co_name})
+                         total_dup += dup_count
+                         total_del += affected_rows
+             else:
+                 for date_val in all_dates:
+                     dup_count, affected_rows, _, err = process_date(date_val)
+                     if err:
+                         logger.warning('Partition processing failed', {"db": database, "table": table, "date": date_val, "error": err, "func": sys._getframe().f_code.co_name})
+                     total_dup += dup_count
+                     total_del += affected_rows
+             logger.info('Single-table dedup finished', {"db": database, "table": table, "result [dups, deleted]": (total_dup, total_del), 'date_range': f"{start_date} - {end_date}"})
+             # Reorder the id column automatically (only when rows were actually deleted and reorder_id is True)
+             if reorder_id and total_del > 0:
+                 try:
+                     reorder_ok = self.reorder_id_column(database, table, id_column=self.primary_key, dry_run=dry_run)
+                     logger.info('Automatic id reorder finished', {"db": database, "table": table, "result": reorder_ok})
+                 except Exception as e:
+                     logger.error('Automatic id reorder exception', {"db": database, "table": table, "error": str(e)})
+             return (total_dup, total_del)
+         # No date_column: deduplicate the whole table directly
+         result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
+         logger.info('Single-table dedup finished', {"db": database, "table": table, "result [dups, deleted]": result, 'date_range': 'whole table'})
          dup_count, affected_rows = result
          if reorder_id and affected_rows > 0:
              try:
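
The date filter above compares str(start_date) <= str(d) <= str(end_date); that is safe because ISO 'YYYY-MM-DD' strings sort lexicographically in chronological order. A quick check:

    dates = ["2025-06-10", "2025-05-01", "2025-06-09"]
    start, end = "2025-05-01", "2025-06-09"
    kept = [d for d in sorted(dates) if start <= d <= end]
    assert kept == ["2025-05-01", "2025-06-09"]
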
@@ -589,28 +705,28 @@ class MySQLDeduplicator:
589
705
  return (0, 0)
590
706
 
591
707
  def deduplicate_database(
592
- self,
593
- database: str,
594
- tables: Optional[List[str]] = None,
595
- columns_map: Optional[Dict[str, List[str]]] = None,
596
- dry_run: bool = False,
597
- parallel: bool = False,
598
- reorder_id: bool = False,
599
- use_python_dedup: bool = True
708
+ self,
709
+ database: str,
710
+ tables: Optional[List[str]] = None,
711
+ columns_map: Optional[Dict[str, List[str]]] = None,
712
+ dry_run: bool = False,
713
+ parallel: bool = False,
714
+ reorder_id: bool = False,
715
+ use_python_dedup: bool = True
600
716
  ) -> Dict[str, Tuple[int, int]]:
601
717
  """
602
- 对指定数据库的所有表进行去重。
718
+ 对指定数据库的所有表进行去重。调用 deduplicate_table,自动适配分天。
603
719
 
604
720
  Args:
605
721
  database (str): 数据库名。
606
- tables (Optional[List[str]]): 要处理的表列表(为None时处理所有表)。
607
- columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
722
+ tables (Optional[List[str]]): 指定表名列表。
723
+ columns_map (Optional[Dict[str, List[str]]]): 每个表的去重列映射。
608
724
  dry_run (bool): 是否为模拟运行。
609
- parallel (bool): 是否并行处理。
610
- reorder_id (bool): 去重后是否重排id
611
- use_python_dedup (bool): 是否用Python查找重复id。
725
+ parallel (bool): 是否并行处理表。
726
+ reorder_id (bool): 去重后是否自动重排 id 列。
727
+ use_python_dedup (bool): 是否用 Python 方式去重。
612
728
  Returns:
613
- Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
729
+ Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
614
730
  """
615
731
  results = {}
616
732
  try:
@@ -626,8 +742,6 @@ class MySQLDeduplicator:
                  return results
              logger.info('Database stats', {"db": database, "table_count": len(target_tables), "tables": target_tables})
              if parallel and self.max_workers > 1:
-                 logger.debug('Processing tables in parallel', {'db': database, 'max_workers': self.max_workers})
-                 # Process in parallel with a thread pool
                  with concurrent.futures.ThreadPoolExecutor(
                      max_workers=self.max_workers
                  ) as executor:
@@ -637,7 +751,7 @@ class MySQLDeduplicator:
                          logger.debug('Submitting table dedup task', {'db': database, 'table': table, 'columns': columns})
                          futures[executor.submit(
                              self.deduplicate_table,
-                             database, table, columns, dry_run, reorder_id, True
+                             database, table, columns, dry_run, reorder_id, use_python_dedup
                          )] = table
                      for future in concurrent.futures.as_completed(futures):
                          table = futures[future]
@@ -648,45 +762,43 @@ class MySQLDeduplicator:
                              logger.error('Exception', {"db": database, "table": table, "error": str(e), 'traceback': repr(e)})
                              results[table] = (0, 0)
              else:
-                 logger.debug('Processing tables serially', {'db': database})
-                 # Process serially
                  for table in target_tables:
                      columns = columns_map.get(table) if columns_map else None
                      dup_count, affected_rows = self.deduplicate_table(
-                         database, table, columns, dry_run, reorder_id, True
+                         database, table, columns, dry_run, reorder_id, use_python_dedup
                      )
                      results[table] = (dup_count, affected_rows)
              total_dup = sum(r[0] for r in results.values())
              total_del = sum(r[1] for r in results.values())
-             logger.info('Single-database dedup finished', {"db": database, "duplicate_group_count": total_dup, "total_deleted_row_count": total_del, "results": results})
+             logger.info('Database dedup finished', {"db": database, "duplicate_groups": total_dup, "total_deleted_rows": total_del, "results": results})
              return results
          except Exception as e:
              logger.error('Global error', {"db": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
              return results
 
      def deduplicate_all(
-             self,
-             databases: Optional[List[str]] = None,
-             tables_map: Optional[Dict[str, List[str]]] = None,
-             columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
-             dry_run: bool = False,
-             parallel: bool = False,
-             reorder_id: bool = False,
-             use_python_dedup: bool = True
+             self,
+             databases: Optional[List[str]] = None,
+             tables_map: Optional[Dict[str, List[str]]] = None,
+             columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
+             dry_run: bool = False,
+             parallel: bool = False,
+             reorder_id: bool = False,
+             use_python_dedup: bool = True
      ) -> Dict[str, Dict[str, Tuple[int, int]]]:
          """
-         Deduplicate every database.
+         Deduplicate every database. Delegates to deduplicate_database, which adapts to day partitioning automatically.
 
          Args:
-             databases (Optional[List[str]]): databases to process. None processes every non-system database.
-             tables_map (Optional[Dict[str, List[str]]]): tables to process per database, as {database: [table, ...]}. None processes every table.
-             columns_map (Optional[Dict[str, Dict[str, List[str]]]]): deduplication columns per table, as {database: {table: [column, ...]}}. None uses all columns.
-             dry_run (bool): dry-run mode; when True, only counts duplicate rows without deleting.
-             parallel (bool): whether to process multiple databases in parallel with a thread pool.
-             reorder_id (bool): whether to reorder the id column after deduplication
-             use_python_dedup (bool): whether to find duplicate ids in Python.
+             databases (Optional[List[str]]): database names to process.
+             tables_map (Optional[Dict[str, List[str]]]): table-name mapping per database.
+             columns_map (Optional[Dict[str, Dict[str, List[str]]]]): deduplication-column mapping per database and table.
+             dry_run (bool): whether this is a dry run.
+             parallel (bool): whether to process databases in parallel.
+             reorder_id (bool): whether to automatically reorder the id column after deduplication.
+             use_python_dedup (bool): whether to deduplicate in Python.
          Returns:
-             Dict[str, Dict[str, Tuple[int, int]]]: nested dict, {database: {table: (duplicate group count, rows actually deleted)}}
+             Dict[str, Dict[str, Tuple[int, int]]]: {database: {table: (duplicate group count, rows actually deleted)}}
          """
          all_results: Dict[str, Dict[str, Tuple[int, int]]] = defaultdict(dict)
          try:
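
The three optional maps narrow the scope of deduplicate_all, and their nesting differs by one level each; a hedged sketch of the expected shapes (all database and table names here are placeholders):

    results = deduplicator.deduplicate_all(
        databases=['db_a', 'db_b'],                      # None: every non-system database
        tables_map={'db_a': ['t1', 't2']},               # None: every table per database
        columns_map={'db_a': {'t1': ['name', '日期']}},  # None: all non-excluded columns
        dry_run=True,
        parallel=False,
        reorder_id=False,
    )
    # -> {'db_a': {'t1': (dup_groups, deleted), 't2': (...)}, 'db_b': {...}}
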
@@ -696,9 +808,18 @@ class MySQLDeduplicator:
              if not target_dbs:
                  logger.warning('No databases to process')
                  return all_results
-             logger.info('Global dedup started', {"db_count": len(target_dbs), "databases": target_dbs, "params": {"dry_run": dry_run, "parallel": parallel, 'exclude_columns': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
+             logger.info('Global dedup started', {
+                 "db_count": len(target_dbs),
+                 "databases": target_dbs,
+                 "params": {
+                     "dry_run": dry_run,
+                     "parallel": parallel,
+                     'exclude_columns': self.exclude_columns,
+                     'reorder_id': reorder_id,
+                     'use_python_dedup': use_python_dedup
+                 },
+             })
              if parallel and self.max_workers > 1:
-                 # Process multiple databases in parallel with a thread pool
                  with concurrent.futures.ThreadPoolExecutor(
                      max_workers=self.max_workers
                  ) as executor:
@@ -708,7 +829,7 @@ class MySQLDeduplicator:
                          db_columns_map = columns_map.get(db) if columns_map else None
                          futures[executor.submit(
                              self.deduplicate_database,
-                             db, tables, db_columns_map, dry_run, False, reorder_id, True
+                             db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
                          )] = db
                      for future in concurrent.futures.as_completed(futures):
                          db = futures[future]
@@ -719,12 +840,11 @@ class MySQLDeduplicator:
                              logger.error('Exception', {"db": db, "error": str(e), 'traceback': repr(e)})
                              all_results[db] = {}
              else:
-                 # Process databases serially
                  for db in target_dbs:
                      tables = tables_map.get(db) if tables_map else None
                      db_columns_map = columns_map.get(db) if columns_map else None
                      db_results = self.deduplicate_database(
-                         db, tables, db_columns_map, dry_run, parallel, reorder_id, True
+                         db, tables, db_columns_map, dry_run, parallel, reorder_id, use_python_dedup
                      )
                      all_results[db] = db_results
              total_dup = sum(
@@ -735,7 +855,18 @@ class MySQLDeduplicator:
                  r[1] for db in all_results.values()
                  for r in db.values()
              )
-             logger.info('Global dedup finished', {"total_duplicate_group_count": total_dup, "total_deleted_row_count": total_del, "results": dict(all_results)})
+             logger.info('Global dedup finished', {
+                 "total_duplicate_groups": total_dup,
+                 "total_deleted_rows": total_del,
+                 "params": {
+                     "dry_run": dry_run,
+                     "parallel": parallel,
+                     'exclude_columns': self.exclude_columns,
+                     'reorder_id': reorder_id,
+                     'use_python_dedup': use_python_dedup
+                 },
+                 "results": dict(all_results)
+             })
              return all_results
          except Exception as e:
              logger.error('Exception', {"error": str(e), 'traceback': repr(e)})
@@ -780,6 +911,31 @@ class MySQLDeduplicator:
                  cursor.execute(sql, (database, table))
                  return bool(cursor.fetchone())
 
+     @_retry_on_failure
+     def _get_table_info(self, database: str, table: str, id_column: str = None):
+         """
+         Get all column names, the primary-key column list, and whether the given id column is a primary key.
+         Args:
+             database (str): database name.
+             table (str): table name.
+             id_column (str): id column name; defaults to self.primary_key.
+         Returns:
+             Tuple[List[str], List[str], bool]: (all columns, primary-key columns, whether the id column is a primary key)
+         """
+         id_column = id_column or self.primary_key
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute("""
+                     SELECT COLUMN_NAME, COLUMN_KEY
+                     FROM INFORMATION_SCHEMA.COLUMNS
+                     WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+                 """, (database, table))
+                 columns_info = cursor.fetchall()
+                 columns = [row['COLUMN_NAME'] for row in columns_info]
+                 pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+                 id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+                 return columns, pk_cols, id_is_pk
+
      def close(self) -> None:
          """
          Close the connection pool.
@@ -828,15 +984,16 @@ class MySQLDeduplicator:
          auto_drop_backup: bool = True
      ) -> Any:
          """
-         Safely rebuild the id column of a given table, or of every table in a database, as a sequential auto-increment (1, 2, 3...).
+         Safely rebuild the id column of a given table, or of every table in a database, as a sequential auto-increment (1, 2, 3...).
+
          Args:
-             database (str): database name
-             table (Optional[str]): table name; None processes every table in the database
-             id_column (str): id column name, default "id"
-             dry_run (bool): whether this is a dry run
-             auto_drop_backup (bool): drop the backup table automatically after verification
+             database (str): database name.
+             table (Optional[str]): table name; None processes every table in the database.
+             id_column (str): id column name, default "id"
+             dry_run (bool): whether this is a dry run.
+             auto_drop_backup (bool): drop the backup table automatically after verification.
          Returns:
-             bool or dict: bool for a single table, {table_name: bool} in batch mode
+             bool or dict: bool for a single table, {table_name: bool} in batch mode
          """
          if not table:
              # Batch mode: process every table in the database
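
A call sketch for the signature above, on a placeholder table. In dry_run mode the method only prints the would-be schema; with auto_drop_backup=True the backup_<...> table is dropped once the result verifies:

    # Single table: returns bool
    ok = deduplicator.reorder_id_column(
        'my_db', 'my_table',
        id_column='id',
        dry_run=True,
        auto_drop_backup=True,
    )
    # Batch mode: table=None processes every table and returns {table: bool}
    results = deduplicator.reorder_id_column('my_db', table=None, dry_run=True)
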
@@ -865,22 +1022,12 @@ class MySQLDeduplicator:
          if not self._check_table_exists(database, table):
              logger.warning('Table does not exist, skipping id reorder', {"db": database, "table": table})
              return False
-         # Check whether the id column exists
-         with self._get_connection() as conn:
-             with conn.cursor() as cursor:
-                 cursor.execute("""
-                     SELECT COLUMN_NAME, COLUMN_KEY
-                     FROM INFORMATION_SCHEMA.COLUMNS
-                     WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
-                 """, (database, table))
-                 columns_info = cursor.fetchall()
-                 columns = [row['COLUMN_NAME'] for row in columns_info]
-                 id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+         # Fetch id-column and primary-key info (via _get_table_info)
+         columns, pk_cols, id_is_pk = self._get_table_info(database, table, id_column)
          if id_column not in columns:
              logger.warning('Table has no id column, skipping id reorder', {"db": database, "table": table})
              return False
          # Check that the primary key is the single id column
-         pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
          if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
              logger.warning('Primary key is not a single id column, skipping id reorder', {"db": database, "table": table, "pk_columns": pk_cols})
              return False
@@ -903,9 +1050,9 @@ class MySQLDeduplicator:
          if dry_run:
              logger.info('dry_run mode: printing the original table schema', {"db": database, "table": table, "create_table_sql": create_table_sql})
              return True
-         temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_reorderid_{os.getpid()}_{threading.get_ident()}")
+         temp_table = self._make_temp_table_name(table)
          temp_table_quoted = f"`{database}`.`{temp_table}`"
-         backup_table = self._make_safe_table_name(table, prefix="backup_", suffix=f"_{int(time.time())}_{uuid.uuid4().hex[:8]}")
+         backup_table = self._make_backup_table_name(table)
          backup_table_quoted = f"`{database}`.`{backup_table}`"
          try:
              with self._get_connection() as conn:
@@ -1000,6 +1147,41 @@ class MySQLDeduplicator:
          finally:
              self._release_table_lock(database, table)
 
+     def _acquire_table_lock(self, database: str, table: str, timeout: int = 60) -> bool:
+         """
+         Acquire a table-level lock to keep concurrent threads/processes from working on the same table.
+         Args:
+             database (str): database name.
+             table (str): table name.
+             timeout (int): how long to wait for the lock, in seconds.
+         Returns:
+             bool: whether the lock was acquired.
+         """
+         key = f"{database.lower()}::{table.lower()}"
+         start_time = time.time()
+         while True:
+             with self._lock:
+                 if key not in self._processing_tables:
+                     self._processing_tables.add(key)
+                     return True
+             if time.time() - start_time > timeout:
+                 logger.warning('Timed out acquiring the table lock', {"db": database, "table": table, "timeout": timeout})
+                 return False
+             time.sleep(0.2)
+
+     def _release_table_lock(self, database: str, table: str) -> None:
+         """
+         Release the table-level lock.
+         Args:
+             database (str): database name.
+             table (str): table name.
+         Returns:
+             None
+         """
+         key = f"{database.lower()}::{table.lower()}"
+         with self._lock:
+             self._processing_tables.discard(key)
+
      @staticmethod
      def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
          """
@@ -1020,6 +1202,20 @@ class MySQLDeduplicator:
              return (prefix + suffix)[:max_length]
          return f"{prefix}{base[:remain]}{suffix}"[:max_length]
 
+     def _make_temp_table_name(self, base: str) -> str:
+         """
+         Build a temp table name: temp_ prefix plus a _dedup_<pid>_<thread-id> suffix.
+         """
+         suffix = f"_dedup_{os.getpid()}_{threading.get_ident()}"
+         return self._make_safe_table_name(base, prefix="temp_", suffix=suffix)
+
+     def _make_backup_table_name(self, base: str) -> str:
+         """
+         Build a backup table name: backup_ prefix plus a timestamp+uuid suffix.
+         """
+         suffix = f"_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+         return self._make_safe_table_name(base, prefix="backup_", suffix=suffix)
+
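
Both helpers funnel through _make_safe_table_name so every generated name respects MySQL's 64-character identifier limit. A rough standalone mirror (the remain computation is assumed from the truncation lines shown above):

    import os
    import threading

    def make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
        # Keep prefix and suffix intact; truncate only the base name.
        remain = max_length - len(prefix) - len(suffix)
        if remain <= 0:
            return (prefix + suffix)[:max_length]
        return f"{prefix}{base[:remain]}{suffix}"[:max_length]

    name = make_safe_table_name(
        "a_very_long_table_name_" * 4,
        prefix="temp_",
        suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}",
    )
    assert len(name) <= 64 and name.startswith("temp_")
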
 
  def main():
      deduplicator = MySQLDeduplicator(
@@ -1027,18 +1223,42 @@ def main():
          password='pwd',
          host='localhost',
          port=3306,
-         date_range=['2025-05-27', '2025-05-28'],
-         exclude_tables={'推广数据2': ['地域报表_城市_2025_05_copy1', '主体报表_2025_copy1']}
+         max_workers=2,
+         batch_size=1000,
+         skip_system_dbs=True,
+         max_retries=3,
+         retry_interval=5,
+         pool_size=5,
+         recent_month=1,
+         # date_range=['2025-06-09', '2025-06-10'],
+         date_column='日期',
+         exclude_columns=None,
+         exclude_databases=['测试库4'],
+         exclude_tables={
+             '推广数据2': [
+                 '地域报表_城市_2025_04',
+                 '地域报表_城市_2025_05',
+                 '地域报表_城市_2025_06',
+                 # '地域报表_城市_2025_04_copy1',
+                 # '地域报表_城市_2025_05_copy1',
+                 # '地域报表_城市_2025_06_copy1',
+                 '奥莱店_主体报表',
+                 # '奥莱店_主体报表_copy1',
+             ],
+             "生意参谋3": [
+                 "商品排行_2025",
+             ],
+         },
      )
 
      # Deduplicate all databases
      deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
 
      # # Deduplicate a given database (multi-threaded)
-     # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False, reorder_id=False)
+     # deduplicator.deduplicate_database('推广数据2', dry_run=False, parallel=True, reorder_id=True)
 
      # # Deduplicate a given table (using specific columns)
-     # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False, reorder_id=False)
+     # deduplicator.deduplicate_table('推广数据2', '地域报表_城市_2025_06_copy1', columns=[], dry_run=False, reorder_id=True)
 
      # # Reorder the id column
      # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
@@ -1047,5 +1267,5 @@ def main():
      deduplicator.close()
 
  if __name__ == '__main__':
-     main()
+     # main()
      pass

mdbq-3.11.9.dist-info/METADATA → mdbq-3.11.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mdbq
- Version: 3.11.9
+ Version: 3.11.11
  Home-page: https://pypi.org/project/mdbq
  Author: xigua,
  Author-email: 2587125111@qq.com

mdbq-3.11.9.dist-info/RECORD → mdbq-3.11.11.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
- mdbq/__version__.py,sha256=PDdrWyCY8MR3t82c_RzSF6lAB6oCcZdWveXkX7AvIIQ,18
+ mdbq/__version__.py,sha256=GrY3av2BYeEaosI2qWYizQyTwyijdq8IuOuFjTJqLxE,19
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
  mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
- mdbq/mysql/deduplicator.py,sha256=G7hdIO6rDLBNo1jSm6PbmPAzzfdN2jZFP4BnLhO02Mo,52970
+ mdbq/mysql/deduplicator.py,sha256=e84MLhWjdCoDB8GxUV-z5drn8hdKGlJKnHzNW0rjIM8,65345
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
  mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
- mdbq-3.11.9.dist-info/METADATA,sha256=djSbJHNSHuyh2So6ia5CluTggpZ4REj9jxhO9vwOeKw,364
- mdbq-3.11.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mdbq-3.11.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
- mdbq-3.11.9.dist-info/RECORD,,
+ mdbq-3.11.11.dist-info/METADATA,sha256=NHTu8tsBwtvh90jaiNN4E4i9SW5xkH6P-yYcBrxwSbU,365
+ mdbq-3.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mdbq-3.11.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+ mdbq-3.11.11.dist-info/RECORD,,
File without changes