mdbq 3.10.9__py3-none-any.whl → 3.10.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
- VERSION = '3.10.9'
+ VERSION = '3.10.10'
mdbq/mysql/deduplicator.py CHANGED
@@ -1,5 +1,4 @@
  # -*- coding:utf-8 -*-
- import datetime
  import re
  import time
  from functools import wraps
@@ -7,11 +6,12 @@ import warnings
  import pymysql
  import os
  from mdbq.log import mylogger
- from typing import List, Dict, Optional, Any, Tuple, Set
+ from typing import List, Dict, Optional, Any, Tuple
  from dbutils.pooled_db import PooledDB
  import threading
  import concurrent.futures
  from collections import defaultdict
+ import sys


  warnings.filterwarnings('ignore')
@@ -73,27 +73,27 @@ class MySQLDeduplicator:
  max_retries: int = 3,
  retry_interval: int = 5,
  pool_size: int = 5,
- primary_key: str = 'id'
+ primary_key: str = 'id',
+ date_range: Optional[List[str]] = None,
+ recent_month: Optional[int] = None,
+ date_column: str = '日期',
+ exclude_columns: Optional[List[str]] = None
  ) -> None:
  """
  Initialize the deduplicator.
-
- :param username: database username
- :param password: database password
- :param host: database host, defaults to localhost
- :param port: database port, defaults to 3306
- :param charset: character set, defaults to utf8mb4
- :param max_workers: maximum number of worker threads, defaults to 1 (single-threaded)
- :param batch_size: batch size, defaults to 1000
- :param skip_system_dbs: whether to skip system databases, defaults to True
- :param max_retries: maximum number of retries
- :param retry_interval: retry interval in seconds
- :param pool_size: connection pool size
- :param primary_key: primary key column name, defaults to 'id'
+ New parameters:
+ :param date_range: date range [start_date, end_date] to deduplicate, in 'YYYY-MM-DD' format
+ :param recent_month: deduplicate only the most recent N months of data (mutually exclusive with date_range; if both are given, date_range wins)
+ :param date_column: date column name, defaults to '日期'
+ :param exclude_columns: columns to exclude from deduplication, defaults to ['id', '更新时间']
  """
  # Connection pool state flag
  self._closed = False
-
+ logger.debug('初始化MySQLDeduplicator', {
+ 'host': host, 'port': port, 'user': username, 'charset': charset,
+ 'max_workers': max_workers, 'batch_size': batch_size, 'pool_size': pool_size,
+ 'exclude_columns': exclude_columns
+ })
  # Initialize the connection pool
  self.pool = PooledDB(
  creator=pymysql,
@@ -114,6 +114,33 @@ class MySQLDeduplicator:
  self.retry_interval = retry_interval
  self.primary_key = primary_key

+ # Time-range parameters
+ self.date_range = date_range
+ self.recent_month = recent_month
+ self.date_column = date_column
+ self._dedup_start_date = None
+ self._dedup_end_date = None
+ # 'id' is always excluded, whether or not exclude_columns is passed
+ default_exclude = {'id'}
+ # If exclude_columns is not passed, exclude ['id', '更新时间']
+ if not exclude_columns:
+ self.exclude_columns = list(default_exclude | {'更新时间'})
+ else:
+ self.exclude_columns = list(set(exclude_columns) | default_exclude)
+ # Parse the time range
+ if self.date_range and len(self.date_range) == 2:
+ self._dedup_start_date, self._dedup_end_date = self.date_range
+ elif self.recent_month:
+ from datetime import datetime, timedelta
+ today = datetime.today()
+ month = today.month - self.recent_month
+ year = today.year
+ while month <= 0:
+ month += 12
+ year -= 1
+ self._dedup_start_date = f"{year}-{month:02d}-01"
+ self._dedup_end_date = today.strftime("%Y-%m-%d")
+
  # Thread-safety controls
  self._lock = threading.Lock()
  self._processing_tables = set()  # set of tables currently being processed
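
For reference, the recent_month branch above walks the month counter back and borrows a year whenever the counter drops to zero or below. A standalone sketch of that arithmetic (it mirrors the constructor logic shown in the hunk; not shipped in the package):

from datetime import datetime

def recent_month_window(recent_month, today=None):
    # Start at the 1st of the month `recent_month` months ago; end today.
    today = today or datetime.today()
    month = today.month - recent_month
    year = today.year
    while month <= 0:  # borrow one year for every 12 months stepped back
        month += 12
        year -= 1
    return f"{year}-{month:02d}-01", today.strftime("%Y-%m-%d")

print(recent_month_window(7, datetime(2025, 5, 15)))  # ('2024-10-01', '2025-05-15')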
@@ -124,24 +151,25 @@ class MySQLDeduplicator:
  def _get_connection(self) -> pymysql.connections.Connection:
  """Get a connection from the pool."""
  if self._closed:
+ logger.error('尝试获取连接但连接池已关闭')
  raise ConnectionError("连接池已关闭")
  try:
  conn = self.pool.connection()
  logger.debug("成功获取数据库连接")
  return conn
  except Exception as e:
- logger.error(f"获取数据库连接失败: {str(e)}")
+ logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
  raise ConnectionError(f"连接数据库失败: {str(e)}")

  @staticmethod
  def _retry_on_failure(func: Any) -> Any:
  """Retry decorator."""
-
  @wraps(func)
  def wrapper(self, *args, **kwargs):
  last_exception = None
  for attempt in range(self.max_retries + 1):
  try:
+ logger.debug(f'调用{func.__name__},第{attempt+1}次尝试', {'args': args, 'kwargs': kwargs})
  return func(self, *args, **kwargs)
  except (pymysql.OperationalError, pymysql.InterfaceError) as e:
  last_exception = e
@@ -149,18 +177,17 @@ class MySQLDeduplicator:
  wait_time = self.retry_interval * (attempt + 1)
  logger.warning(
  f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
- {'error': str(e), 'wait_time': wait_time})
+ {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
  time.sleep(wait_time)
  continue
  except Exception as e:
  last_exception = e
- logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
+ logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__, 'func': func.__name__})
  break
-
  if last_exception:
+ logger.error('重试后依然失败', {'func': func.__name__, 'last_exception': str(last_exception)})
  raise last_exception
  raise Exception("未知错误")
-
  return wrapper

  @_retry_on_failure
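
The backoff above is linear: a failed attempt waits retry_interval * (attempt + 1) seconds before the next try. A quick check of the schedule under the defaults shown earlier, retry_interval=5 and max_retries=3 (a sketch, not package code):

retry_interval, max_retries = 5, 3
print([retry_interval * (attempt + 1) for attempt in range(max_retries)])  # [5, 10, 15]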
@@ -241,104 +268,111 @@ class MySQLDeduplicator:
  """
  if not self._acquire_table_lock(database, table):
  return (0, 0)
-
+ temp_table = None
  try:
- logger.info(f"开始处理表: {database}.{table}")
-
+ # Get the total row count before deduplication
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
+ cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
+ total_count_row = cursor.fetchone()
+ total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+ logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
  # Get the actual column names
  all_columns = self._get_table_columns(database, table)
- if not all_columns:
- logger.warning(f"表 {database}.{table} 没有有效列(可能只有id列),跳过")
- return (0, 0)
-
- # Use the specified columns, or all columns
+ logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
+ # Check whether a time-range filter is needed
+ use_time_filter = False
+ time_col = self.date_column
+ all_columns_lower = [col.lower() for col in all_columns]
+ # Drop the exclude_columns
+ exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
+ # Compare column names case-insensitively
  use_columns = columns or all_columns
- invalid_columns = set(use_columns) - set(all_columns)
-
+ use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
+ invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
  if invalid_columns:
- logger.warning(
- f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
- {'invalid_columns': list(invalid_columns)}
- )
- use_columns = [col for col in use_columns if col in all_columns]
-
+ logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
  if not use_columns:
- logger.error(f"表 {database}.{table} 没有有效的去重列")
+ logger.error('没有有效的去重列', {"库": database, "表": table})
  return (0, 0)
-
- # Build the dedup SQL
+ # Wrap every identifier in backticks
  column_list = ', '.join([f'`{col}`' for col in use_columns])
- # Keep the temp table name under the 64-character limit
- temp_table = f"temp_{table}_dedup_{os.getpid()}"
+ temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
  temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
  pk = self.primary_key
- # Verify that the primary key column exists
- if pk not in all_columns and pk != 'id':
- logger.error(f"表 {database}.{table} 不存在主键列 {pk}")
+ # The primary key check is case-insensitive too
+ if pk.lower() not in all_columns_lower and pk != 'id':
+ logger.error('不存在主键列', {"库": database, "表": table, "主键列": pk})
  return (0, 0)
+ # Resolve the actual primary key name
+ pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
+ # Build the time-range WHERE clause
+ where_time = ''
+ if use_time_filter:
+ where_time = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'"
  create_temp_sql = f"""
  CREATE TABLE `{database}`.`{temp_table}` AS
- SELECT MIN(`{pk}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+ SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
  FROM `{database}`.`{table}`
+ {where_time}
  GROUP BY {column_list}
  HAVING COUNT(*) > 1
  """
-
- delete_dup_sql = f"""
- DELETE FROM `{database}`.`{table}`
- WHERE `{pk}` NOT IN (
- SELECT `min_id` FROM `{database}`.`{temp_table}`
- ) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
- """
-
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
-
  with self._get_connection() as conn:
  with conn.cursor() as cursor:
- # Create a temp table collecting the duplicate groups
+ logger.debug('创建临时表SQL', {'sql': create_temp_sql})
  cursor.execute(create_temp_sql)
+ logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
  cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
  dup_count_row = cursor.fetchone()
  dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
-
  if dup_count == 0:
- logger.info(f"表 {database}.{table} 没有重复数据")
+ logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
+ logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
  cursor.execute(drop_temp_sql)
  conn.commit()
  return (0, 0)
-
- logger.info(
- f"表 {database}.{table} 发现 {dup_count} 组重复数据",
- {'columns': use_columns}
- )
-
+ affected_rows = 0
  if not dry_run:
- # Perform the actual deletion
- cursor.execute(delete_dup_sql)
- affected_rows = cursor.rowcount
- conn.commit()
- logger.info(
- f"表 {database}.{table} 已删除 {affected_rows} 行重复数据",
- {'columns': use_columns}
- )
+ # Delete in batches to avoid long table locks
+ while True:
+ delete_dup_sql = f"""
+ DELETE FROM `{database}`.`{table}`
+ WHERE `{pk_real}` NOT IN (
+ SELECT `min_id` FROM `{database}`.`{temp_table}`
+ ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
+ AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
+ LIMIT {self.batch_size}
+ """
+ logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
+ cursor.execute(delete_dup_sql)
+ batch_deleted = cursor.rowcount
+ affected_rows += batch_deleted
+ conn.commit()
+ if batch_deleted < self.batch_size:
+ break
+ logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
  else:
+ logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
  affected_rows = 0
- logger.info(
- f"[模拟运行] 表 {database}.{table} 将删除 {dup_count} 组重复数据",
- {'columns': use_columns}
- )
-
- # Clean up the temp table
+ logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
  cursor.execute(drop_temp_sql)
  conn.commit()
-
  return (dup_count, affected_rows)
-
  except Exception as e:
- logger.error(
- f"处理表 {database}.{table} 时出错: {str(e)}",
- {'error_type': type(e).__name__}
- )
+ logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
+ # Clean up the temp table on failure as well
+ if temp_table:
+ try:
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+ cursor.execute(drop_temp_sql)
+ conn.commit()
+ except Exception as drop_e:
+ logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
  return (0, 0)
  finally:
  self._release_table_lock(database, table)
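
To make the new flow concrete: the pass now snapshots each duplicate group into a temp table (keeping MIN(primary key) per group), deletes the rest in LIMIT-ed batches, and drops the temp table even in the except path. Note that use_time_filter is initialized to False and never flipped in this hunk, so the date filter stays dormant in this release; the sketch below omits it accordingly. For a hypothetical table `db`.`sales` deduplicated on (`日期`, `店铺名称`) with primary key `id` and batch_size=1000, the generated statements would look roughly like this (names made up; an illustration, not package output):

# Step 1: materialize duplicate groups, keeping the smallest id per group.
create_temp_sql = """
CREATE TABLE `db`.`temp_sales_dedup_1234_5678` AS
SELECT MIN(`id`) as `min_id`, `日期`, `店铺名称`, COUNT(*) as `dup_count`
FROM `db`.`sales`
GROUP BY `日期`, `店铺名称`
HAVING COUNT(*) > 1
"""
# Step 2: executed repeatedly until a batch deletes fewer than batch_size rows.
delete_dup_sql = """
DELETE FROM `db`.`sales`
WHERE `id` NOT IN (SELECT `min_id` FROM `db`.`temp_sales_dedup_1234_5678`)
AND (`日期` IS NOT NULL AND `店铺名称` IS NOT NULL)
LIMIT 1000
"""
# Step 3: always runs, including after an exception.
drop_temp_sql = "DROP TABLE IF EXISTS `db`.`temp_sales_dedup_1234_5678`"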
@@ -360,17 +394,15 @@ class MySQLDeduplicator:
  :return: (number of duplicate rows, number of deleted rows)
  """
  try:
- # Check that the table exists
  if not self._check_table_exists(database, table):
- logger.warning(f"表 {database}.{table} 不存在,跳过")
+ logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
  return (0, 0)
-
- return self._deduplicate_table(database, table, columns, dry_run)
+ logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
+ result = self._deduplicate_table(database, table, columns, dry_run)
+ logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
+ return result
  except Exception as e:
- logger.error(
- f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
- {'error_type': type(e).__name__}
- )
+ logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
  return (0, 0)

  def deduplicate_database(
@@ -392,49 +424,40 @@
  :return: dict {table name: (duplicate rows, deleted rows)}
  """
  results = {}
-
  try:
- # Check that the database exists
  if not self._check_database_exists(database):
- logger.warning(f"数据库 {database} 不存在,跳过")
+ logger.warning('数据库不存在', {"库": database})
  return results
-
- # Get the tables to process
  target_tables = tables or self._get_tables(database)
+ logger.debug('获取目标表', {'库': database, 'tables': target_tables})
  if not target_tables:
- logger.info(f"数据库 {database} 中没有表,跳过")
+ logger.info('数据库中没有表', {"库": database, "操作": "跳过"})
  return results
-
- logger.info(
- f"开始处理数据库 {database} 中的 {len(target_tables)} 张表",
- {'tables': target_tables}
- )
-
+ logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
  if parallel and self.max_workers > 1:
- # Process in parallel
+ logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
+ # Process tables in parallel with a thread pool
  with concurrent.futures.ThreadPoolExecutor(
  max_workers=self.max_workers
  ) as executor:
  futures = {}
  for table in target_tables:
  columns = columns_map.get(table) if columns_map else None
+ logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
  futures[executor.submit(
  self.deduplicate_table,
  database, table, columns, dry_run
  )] = table
-
  for future in concurrent.futures.as_completed(futures):
  table = futures[future]
  try:
  dup_count, affected_rows = future.result()
  results[table] = (dup_count, affected_rows)
  except Exception as e:
- logger.error(
- f"处理表 {database}.{table} 时出错: {str(e)}",
- {'error_type': type(e).__name__}
- )
+ logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
  results[table] = (0, 0)
  else:
+ logger.debug('串行处理表', {'库': database})
  # Serial processing
  for table in target_tables:
  columns = columns_map.get(table) if columns_map else None
@@ -442,20 +465,12 @@
  database, table, columns, dry_run
  )
  results[table] = (dup_count, affected_rows)
-
- # Tally the results
  total_dup = sum(r[0] for r in results.values())
  total_del = sum(r[1] for r in results.values())
-
- logger.info(
- f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
- {'results': results}
- )
-
+ logger.info('单库完成', {"库": database, "重复组数": total_dup, "总删除行数": total_del, "详细结果": results})
  return results
-
  except Exception as e:
- logger.error(f"处理数据库 {database} 时发生全局错误: {str(e)}", {'error_type': type(e).__name__})
+ logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
  return results

  def deduplicate_all(
@@ -477,18 +492,15 @@
  :return: nested dict {database name: {table name: (duplicate rows, deleted rows)}}
  """
  all_results = defaultdict(dict)
-
  try:
- # Get the databases to process
  target_dbs = databases or self._get_databases()
+ logger.debug('获取目标数据库', {'databases': target_dbs})
  if not target_dbs:
- logger.warning("没有可处理的数据库")
+ logger.warning('没有可处理的数据库')
  return all_results
-
- logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
-
+ logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
  if parallel and self.max_workers > 1:
- # Process databases in parallel
+ # Process multiple databases in parallel with a thread pool
  with concurrent.futures.ThreadPoolExecutor(
  max_workers=self.max_workers
  ) as executor:
@@ -500,14 +512,13 @@
  self.deduplicate_database,
  db, tables, db_columns_map, dry_run, False
  )] = db
-
  for future in concurrent.futures.as_completed(futures):
  db = futures[future]
  try:
  db_results = future.result()
  all_results[db] = db_results
  except Exception as e:
- logger.error(f"处理数据库 {db} 时出错: {str(e)}", {'error_type': type(e).__name__})
+ logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
  all_results[db] = {}
  else:
  # Process databases serially
@@ -518,8 +529,6 @@
  db, tables, db_columns_map, dry_run, parallel
  )
  all_results[db] = db_results
-
- # Tally the overall results
  total_dup = sum(
  r[0] for db in all_results.values()
  for r in db.values()
@@ -528,16 +537,10 @@
  r[1] for db in all_results.values()
  for r in db.values()
  )
-
- logger.info(
- f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
- {'total_results': all_results}
- )
-
+ logger.info('全局完成', {"总重复组数": total_dup, "总删除行数": total_del, "详细结果": dict(all_results)})
  return all_results
-
  except Exception as e:
- logger.error(f"全局处理时发生错误: {str(e)}", {'error_type': type(e).__name__})
+ logger.error('异常', {"error": str(e), 'traceback': repr(e)})
  return all_results

  @_retry_on_failure
@@ -571,8 +574,10 @@
  self.pool.close()
  self._closed = True
  logger.info("数据库连接池已关闭")
+ else:
+ logger.info('连接池已关闭或不存在')
  except Exception as e:
- logger.error(f"关闭连接池时出错: {str(e)}", {'error_type': type(e).__name__})
+ logger.error(f"关闭连接池时出错", {'error_type': type(e).__name__, 'error': str(e)})

  def __enter__(self) -> 'MySQLDeduplicator':
  return self
@@ -590,13 +595,14 @@ def main():
  )

  # Deduplicate all databases (single-threaded)
- deduplicator.deduplicate_all()
+ deduplicator.deduplicate_all(dry_run=False, parallel=False)

  # # Deduplicate one database (multi-threaded)
- # deduplicator.deduplicate_database('my_db', parallel=True)
+ # logger.info('调用deduplicate_database')
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)

  # # Deduplicate one table (with specific columns)
- # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)

  # Close the connection
  deduplicator.close()
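
Taken together, the new constructor surface can be exercised like this (a minimal sketch with hypothetical credentials and table names; the import path is inferred from the wheel layout mdbq/mysql/deduplicator.py):

from mdbq.mysql.deduplicator import MySQLDeduplicator

deduplicator = MySQLDeduplicator(
    username='root', password='***', host='localhost', port=3306,  # hypothetical credentials
    date_range=['2025-01-01', '2025-03-31'],  # explicit window; parsed before recent_month
    # recent_month=3,                         # alternative: a rolling N-month window
    date_column='日期',                       # column the window is meant to apply to
    exclude_columns=['更新时间'],             # 'id' is always added to the exclusion set
)
deduplicator.deduplicate_all(dry_run=True, parallel=False)  # report duplicates without deleting
deduplicator.close()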
mdbq-3.10.9.dist-info/METADATA → mdbq-3.10.10.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mdbq
- Version: 3.10.9
+ Version: 3.10.10
  Home-page: https://pypi.org/project/mdbq
  Author: xigua,
  Author-email: 2587125111@qq.com
mdbq-3.10.9.dist-info/RECORD → mdbq-3.10.10.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
- mdbq/__version__.py,sha256=HNkSJG3z_6qz_o8xClGa5PA9tG04IvFuLgr3lvdcfKM,18
+ mdbq/__version__.py,sha256=mvvcpn_eYjsZWNgQAvfJdawR8GlNJmr51SxpSdq4Ekc,19
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
  mdbq/aggregation/optimize.py,sha256=zC_w_aVYXwmvfF0Z8iSGMmv5vptF0rP-Dz5zmp0gXTU,19820
  mdbq/aggregation/query_data.py,sha256=fdotW8qdAyDB13p7r3p6AGBkavcHnf6hIvSMtcS7vqE,179875
@@ -9,7 +9,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
  mdbq/log/mylogger.py,sha256=07sstIeaIQUJXwpMwmxppRI7kW7QwZFnv4Rr3UDlyUs,24133
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
- mdbq/mysql/deduplicator.py,sha256=Ingkpaz7Wy6qHxDP4aiy_2c76dSyuIwPF8_pb5dYC48,22542
+ mdbq/mysql/deduplicator.py,sha256=sm99eneNO7Br21BH-8vnZW3b7jA3gPF7c9Bvz04YV_g,27759
  mdbq/mysql/mysql.py,sha256=Lfy9PsEdgmdRtcG_UUgegH3bFTJPhByTWkcAYl8G6m0,56788
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
  mdbq/mysql/uploader.py,sha256=3RzslC10pNIYm-0NASicvCHXH0zgUXx7uD1jE21z_OU,64677
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
  mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
- mdbq-3.10.9.dist-info/METADATA,sha256=ur4y78xMogVypcNrUAkxFs7JSyZTQPTMpX5YC1wzhCA,364
- mdbq-3.10.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mdbq-3.10.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
- mdbq-3.10.9.dist-info/RECORD,,
+ mdbq-3.10.10.dist-info/METADATA,sha256=zIHTb2D1u7ZjwGM-zGMhGJbOTybYgzB30yjPCRBdW5w,365
+ mdbq-3.10.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mdbq-3.10.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+ mdbq-3.10.10.dist-info/RECORD,,
All other files are unchanged.