mdbq 3.11.11__py3-none-any.whl → 3.12.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package as they appear in one of the supported public registries. The information is provided for informational purposes only.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.11.11'
1
+ VERSION = '3.12.0'
@@ -14,6 +14,7 @@ from collections import defaultdict
14
14
  import sys
15
15
  from datetime import datetime
16
16
  import uuid
17
+ from contextlib import contextmanager
17
18
 
18
19
 
19
20
  warnings.filterwarnings('ignore')
@@ -47,8 +48,8 @@ class MySQLDeduplicator:
47
48
  batch_size: int = 1000,
48
49
  skip_system_dbs: bool = True,
49
50
  max_retries: int = 3,
50
- retry_interval: int = 5,
51
- pool_size: int = 5,
51
+ retry_waiting_time: int = 5,
52
+ pool_size: int = 10,
52
53
  primary_key: str = 'id',
53
54
  date_range: Optional[List[str]] = None,
54
55
  recent_month: Optional[int] = None,
@@ -87,15 +88,30 @@ class MySQLDeduplicator:
87
88
  cursorclass=pymysql.cursors.DictCursor
88
89
  )
89
90
 
91
+ # 并发模式要将 pool_size 加大
92
+ MAX_POOL_SIZE = 200
93
+ MAX_WORKERS = 4
94
+ if max_workers > MAX_WORKERS:
95
+ logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
96
+ max_workers = MAX_WORKERS
97
+ expected_threads = max_workers * 10
98
+ if pool_size < expected_threads:
99
+ logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
100
+ pool_size = expected_threads
101
+ if pool_size > MAX_POOL_SIZE:
102
+ logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
103
+ pool_size = MAX_POOL_SIZE
104
+ self.max_workers = max_workers
105
+ self.pool_size = pool_size
106
+
90
107
  # 配置参数
91
- self.max_workers = min(max(1, max_workers), pool_size) # 限制最大线程数,不能超过连接池
92
108
  self.batch_size = batch_size
93
109
  self.skip_system_dbs = skip_system_dbs
94
110
  self.max_retries = max_retries
95
- self.retry_interval = retry_interval
111
+ self.retry_waiting_time = retry_waiting_time
96
112
  self.primary_key = primary_key
97
113
 
98
- # 时间范围参数(只保留解析后的结果,去除冗余原始参数)
114
+ # 时间范围参数
99
115
  self.date_column = date_column
100
116
  self._dedup_start_date = None
101
117
  self._dedup_end_date = None
@@ -128,6 +144,9 @@ class MySQLDeduplicator:
128
144
  year -= 1
129
145
  self._dedup_start_date = f"{year}-{month:02d}-01"
130
146
  self._dedup_end_date = today.strftime("%Y-%m-%d")
147
+
148
+ if self._dedup_start_date and self._dedup_end_date:
149
+ logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
131
150
 
132
151
  # 排除列处理,直接合并去重
133
152
  self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
@@ -164,6 +183,14 @@ class MySQLDeduplicator:
164
183
  logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
165
184
  raise ConnectionError(f"连接数据库失败: {str(e)}")
166
185
 
186
+ @contextmanager
187
+ def _conn_ctx(self):
188
+ conn = self._get_connection()
189
+ try:
190
+ yield conn
191
+ finally:
192
+ conn.close()
193
+
167
194
  @staticmethod
168
195
  def _retry_on_failure(func: Any) -> Any:
169
196
  """
@@ -187,7 +214,7 @@ class MySQLDeduplicator:
187
214
  except (pymysql.OperationalError, pymysql.InterfaceError) as e:
188
215
  last_exception = e
189
216
  if attempt < self.max_retries:
190
- wait_time = self.retry_interval * (attempt + 1)
217
+ wait_time = self.retry_waiting_time * (attempt + 1)
191
218
  logger.warning(
192
219
  f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
193
220
  {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
@@ -203,7 +230,6 @@ class MySQLDeduplicator:
203
230
  raise Exception("未知错误")
204
231
  return wrapper
205
232
 
206
- @_retry_on_failure
207
233
  def _get_databases(self) -> List[str]:
208
234
  """
209
235
  获取所有非系统数据库列表,排除 exclude_databases。
@@ -212,7 +238,7 @@ class MySQLDeduplicator:
212
238
  List[str]: 数据库名列表。
213
239
  """
214
240
  sql = "SHOW DATABASES"
215
- with self._get_connection() as conn:
241
+ with self._conn_ctx() as conn:
216
242
  with conn.cursor() as cursor:
217
243
  cursor.execute(sql)
218
244
  all_dbs = [row['Database'] for row in cursor.fetchall()]
@@ -220,7 +246,6 @@ class MySQLDeduplicator:
220
246
  filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
221
247
  return filtered
222
248
 
223
- @_retry_on_failure
224
249
  def _get_tables(self, database: str) -> List[str]:
225
250
  """
226
251
  获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。
@@ -231,15 +256,12 @@ class MySQLDeduplicator:
231
256
  List[str]: 表名列表。
232
257
  """
233
258
  sql = "SHOW TABLES"
234
-
235
- with self._get_connection() as conn:
259
+ with self._conn_ctx() as conn:
236
260
  with conn.cursor() as cursor:
237
261
  cursor.execute(f"USE `{database}`")
238
262
  cursor.execute(sql)
239
- # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
240
263
  return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]
241
264
 
242
- @_retry_on_failure
243
265
  def _get_table_columns(self, database: str, table: str) -> List[str]:
244
266
  """
245
267
  获取指定表的所有列名(排除主键列)。
@@ -256,14 +278,12 @@ class MySQLDeduplicator:
256
278
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
257
279
  ORDER BY ORDINAL_POSITION
258
280
  """
259
-
260
- with self._get_connection() as conn:
281
+ with self._conn_ctx() as conn:
261
282
  with conn.cursor() as cursor:
262
283
  cursor.execute(sql, (database, table))
263
284
  return [row['COLUMN_NAME'] for row in cursor.fetchall()
264
285
  if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
265
286
 
266
- @_retry_on_failure
267
287
  def _ensure_index(self, database: str, table: str, date_column: str) -> None:
268
288
  """
269
289
  检查并为 date_column 自动创建索引(如果未存在)。
@@ -273,7 +293,7 @@ class MySQLDeduplicator:
273
293
  table (str): 表名。
274
294
  date_column (str): 需要检查的日期列名。
275
295
  """
276
- with self._get_connection() as conn:
296
+ with self._conn_ctx() as conn:
277
297
  with conn.cursor() as cursor:
278
298
  # 检查索引是否已存在
279
299
  cursor.execute(
@@ -295,7 +315,33 @@ class MySQLDeduplicator:
295
315
  except Exception as e:
296
316
  logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
297
317
 
298
- @_retry_on_failure
318
+ def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
319
+ """
320
+ 生成器:分批拉取表数据,避免一次性加载全部数据到内存。
321
+ Args:
322
+ database (str): 数据库名。
323
+ table (str): 表名。
324
+ select_cols (str): 选择的列字符串。
325
+ select_where (str): where条件字符串。
326
+ batch_size (int): 每批拉取的行数。
327
+ Yields:
328
+ dict: 每行数据。
329
+ """
330
+ offset = 0
331
+ while True:
332
+ sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
333
+ with self._conn_ctx() as conn:
334
+ with conn.cursor() as cursor:
335
+ cursor.execute(sql)
336
+ rows = cursor.fetchall()
337
+ if not rows:
338
+ break
339
+ for row in rows:
340
+ yield row
341
+ if len(rows) < batch_size:
342
+ break
343
+ offset += batch_size
344
+
299
345
  def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
300
346
  """
301
347
  获取表中所有不同的日期分区(按天)。
@@ -308,7 +354,7 @@ class MySQLDeduplicator:
308
354
  List[str]: 所有不同的日期(字符串)。
309
355
  """
310
356
  sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
311
- with self._get_connection() as conn:
357
+ with self._conn_ctx() as conn:
312
358
  with conn.cursor() as cursor:
313
359
  cursor.execute(sql)
314
360
  return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
@@ -367,7 +413,7 @@ class MySQLDeduplicator:
367
413
  pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
368
414
  where_sql = f"t.`{time_col}` = '{date_val}'"
369
415
  # 获取原始数据总量(只统计当天数据)
370
- with self._get_connection() as conn:
416
+ with self._conn_ctx() as conn:
371
417
  with conn.cursor() as cursor:
372
418
  count_where = f"WHERE `{time_col}` = '{date_val}'"
373
419
  count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
@@ -394,7 +440,7 @@ class MySQLDeduplicator:
394
440
  del_ids.extend(ids[1:])
395
441
  affected_rows = 0
396
442
  if not dry_run and del_ids:
397
- with self._get_connection() as conn:
443
+ with self._conn_ctx() as conn:
398
444
  with conn.cursor() as cursor:
399
445
  for i in range(0, len(del_ids), self.batch_size):
400
446
  batch_ids = del_ids[i:i+self.batch_size]
@@ -418,7 +464,7 @@ class MySQLDeduplicator:
418
464
  GROUP BY {column_list}
419
465
  HAVING COUNT(*) > 1
420
466
  """
421
- with self._get_connection() as conn:
467
+ with self._conn_ctx() as conn:
422
468
  with conn.cursor() as cursor:
423
469
  logger.debug('创建临时表SQL', {'sql': create_temp_sql})
424
470
  cursor.execute(create_temp_sql)
@@ -484,7 +530,7 @@ class MySQLDeduplicator:
484
530
  pk = self.primary_key
485
531
  pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
486
532
  # 获取原始数据总量
487
- with self._get_connection() as conn:
533
+ with self._conn_ctx() as conn:
488
534
  with conn.cursor() as cursor:
489
535
  count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
490
536
  logger.debug('执行SQL', {'sql': count_sql})
@@ -508,7 +554,7 @@ class MySQLDeduplicator:
508
554
  del_ids.extend(ids[1:])
509
555
  affected_rows = 0
510
556
  if not dry_run and del_ids:
511
- with self._get_connection() as conn:
557
+ with self._conn_ctx() as conn:
512
558
  with conn.cursor() as cursor:
513
559
  for i in range(0, len(del_ids), self.batch_size):
514
560
  batch_ids = del_ids[i:i+self.batch_size]
@@ -529,7 +575,7 @@ class MySQLDeduplicator:
529
575
  GROUP BY {column_list}
530
576
  HAVING COUNT(*) > 1
531
577
  """
532
- with self._get_connection() as conn:
578
+ with self._conn_ctx() as conn:
533
579
  with conn.cursor() as cursor:
534
580
  logger.debug('创建临时表SQL', {'sql': create_temp_sql})
535
581
  cursor.execute(create_temp_sql)
@@ -584,7 +630,7 @@ class MySQLDeduplicator:
584
630
  logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
585
631
  if temp_table:
586
632
  try:
587
- with self._get_connection() as conn:
633
+ with self._conn_ctx() as conn:
588
634
  with conn.cursor() as cursor:
589
635
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
590
636
  cursor.execute(drop_temp_sql)
@@ -628,13 +674,14 @@ class MySQLDeduplicator:
628
674
  logger.info('单表开始', {
629
675
  "库": database,
630
676
  "表": table,
631
- "参数": {
632
- "指定去重列": columns,
633
- "去重方式": "Python" if use_python_dedup else "SQL",
634
- "数据处理": self.duplicate_keep_mode,
635
- "模拟运行": dry_run,
636
- '排除列': self.exclude_columns,
637
- }})
677
+ # "参数": {
678
+ # "指定去重列": columns,
679
+ # "去重方式": "Python" if use_python_dedup else "SQL",
680
+ # "数据处理": self.duplicate_keep_mode,
681
+ # "模拟运行": dry_run,
682
+ # '排除列': self.exclude_columns,
683
+ # },
684
+ })
638
685
  all_columns = self._get_table_columns(database, table)
639
686
  all_columns_lower = [col.lower() for col in all_columns]
640
687
  time_col = self.date_column
@@ -680,7 +727,7 @@ class MySQLDeduplicator:
680
727
  logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
681
728
  total_dup += dup_count
682
729
  total_del += affected_rows
683
- logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
730
+ logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
684
731
  # 自动重排id列(仅当有实际删除时且reorder_id为True)
685
732
  if reorder_id and total_del > 0:
686
733
  try:
@@ -688,10 +735,12 @@ class MySQLDeduplicator:
688
735
  logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
689
736
  except Exception as e:
690
737
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
738
+ if affected_rows > 0:
739
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
691
740
  return (total_dup, total_del)
692
741
  # 没有date_column,直接全表去重
693
742
  result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
694
- logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
743
+ logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
695
744
  dup_count, affected_rows = result
696
745
  if reorder_id and affected_rows > 0:
697
746
  try:
@@ -699,6 +748,8 @@ class MySQLDeduplicator:
699
748
  logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
700
749
  except Exception as e:
701
750
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
751
+ if affected_rows > 0:
752
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
702
753
  return result
703
754
  except Exception as e:
704
755
  logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -770,7 +821,11 @@ class MySQLDeduplicator:
770
821
  results[table] = (dup_count, affected_rows)
771
822
  total_dup = sum(r[0] for r in results.values())
772
823
  total_del = sum(r[1] for r in results.values())
773
- logger.info('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
824
+ logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
825
+ # 只显示有删除的详细结果
826
+ if total_del > 0:
827
+ filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
828
+ logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
774
829
  return results
775
830
  except Exception as e:
776
831
  logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
@@ -819,7 +874,8 @@ class MySQLDeduplicator:
819
874
  'use_python_dedup': use_python_dedup
820
875
  },
821
876
  })
822
- if parallel and self.max_workers > 1:
877
+ # 如果parallel=True且库数量大于1,则只在外层并发,内层串行
878
+ if parallel and self.max_workers > 1 and len(target_dbs) > 1:
823
879
  with concurrent.futures.ThreadPoolExecutor(
824
880
  max_workers=self.max_workers
825
881
  ) as executor:
@@ -827,6 +883,7 @@ class MySQLDeduplicator:
827
883
  for db in target_dbs:
828
884
  tables = tables_map.get(db) if tables_map else None
829
885
  db_columns_map = columns_map.get(db) if columns_map else None
886
+ # 内层强制串行
830
887
  futures[executor.submit(
831
888
  self.deduplicate_database,
832
889
  db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
@@ -855,7 +912,7 @@ class MySQLDeduplicator:
855
912
  r[1] for db in all_results.values()
856
913
  for r in db.values()
857
914
  )
858
- logger.info('全局完成', {
915
+ logger.debug('全局完成', {
859
916
  "总重复组": total_dup,
860
917
  "总删除行": total_del,
861
918
  "参数": {
@@ -867,12 +924,30 @@ class MySQLDeduplicator:
867
924
  },
868
925
  "详细结果": dict(all_results)
869
926
  })
927
+ # 只显示有删除的详细结果
928
+ if total_del > 0:
929
+ filtered_results = {
930
+ db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
931
+ for db, tbls in all_results.items()
932
+ }
933
+ filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
934
+ logger.info('全局完成(仅显示有删除的结果)', {
935
+ "总重复组": total_dup,
936
+ "总删除行": total_del,
937
+ "参数": {
938
+ "模拟运行": dry_run,
939
+ "并行处理": parallel,
940
+ '排除列': self.exclude_columns,
941
+ '重排id': reorder_id,
942
+ 'use_python_dedup': use_python_dedup
943
+ },
944
+ "详细结果": filtered_results
945
+ })
870
946
  return all_results
871
947
  except Exception as e:
872
948
  logger.error('异常', {"error": str(e), 'traceback': repr(e)})
873
949
  return all_results
874
950
 
875
- @_retry_on_failure
876
951
  def _check_database_exists(self, database: str) -> bool:
877
952
  """
878
953
  检查数据库是否存在。
@@ -883,13 +958,11 @@ class MySQLDeduplicator:
883
958
  bool: 数据库是否存在。
884
959
  """
885
960
  sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
886
-
887
- with self._get_connection() as conn:
961
+ with self._conn_ctx() as conn:
888
962
  with conn.cursor() as cursor:
889
963
  cursor.execute(sql, (database,))
890
964
  return bool(cursor.fetchone())
891
965
 
892
- @_retry_on_failure
893
966
  def _check_table_exists(self, database: str, table: str) -> bool:
894
967
  """
895
968
  检查表是否存在。
@@ -905,13 +978,11 @@ class MySQLDeduplicator:
905
978
  FROM INFORMATION_SCHEMA.TABLES
906
979
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
907
980
  """
908
-
909
- with self._get_connection() as conn:
981
+ with self._conn_ctx() as conn:
910
982
  with conn.cursor() as cursor:
911
983
  cursor.execute(sql, (database, table))
912
984
  return bool(cursor.fetchone())
913
985
 
914
- @_retry_on_failure
915
986
  def _get_table_info(self, database: str, table: str, id_column: str = None):
916
987
  """
917
988
  获取表的所有列名、主键列名列表、指定id列是否为主键。
@@ -923,7 +994,7 @@ class MySQLDeduplicator:
923
994
  Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
924
995
  """
925
996
  id_column = id_column or self.primary_key
926
- with self._get_connection() as conn:
997
+ with self._conn_ctx() as conn:
927
998
  with conn.cursor() as cursor:
928
999
  cursor.execute("""
929
1000
  SELECT COLUMN_NAME, COLUMN_KEY
@@ -1032,7 +1103,7 @@ class MySQLDeduplicator:
1032
1103
  logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
1033
1104
  return False
1034
1105
  # 检查外键约束
1035
- with self._get_connection() as conn:
1106
+ with self._conn_ctx() as conn:
1036
1107
  with conn.cursor() as cursor:
1037
1108
  cursor.execute("""
1038
1109
  SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
@@ -1042,7 +1113,7 @@ class MySQLDeduplicator:
1042
1113
  logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
1043
1114
  return False
1044
1115
  # 获取表结构
1045
- with self._get_connection() as conn:
1116
+ with self._conn_ctx() as conn:
1046
1117
  with conn.cursor() as cursor:
1047
1118
  cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
1048
1119
  create_table_sql = cursor.fetchone()['Create Table']
@@ -1055,7 +1126,7 @@ class MySQLDeduplicator:
1055
1126
  backup_table = self._make_backup_table_name(table)
1056
1127
  backup_table_quoted = f"`{database}`.`{backup_table}`"
1057
1128
  try:
1058
- with self._get_connection() as conn:
1129
+ with self._conn_ctx() as conn:
1059
1130
  with conn.cursor() as cursor:
1060
1131
  # 1. 创建临时表,结构同原表
1061
1132
  try:
@@ -1116,7 +1187,7 @@ class MySQLDeduplicator:
1116
1187
  logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
1117
1188
  return False
1118
1189
  logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
1119
- # 5. 可选:自动删除备份表
1190
+ # 5. 自动删除备份表
1120
1191
  if auto_drop_backup:
1121
1192
  try:
1122
1193
  cursor.execute(f"DROP TABLE {backup_table_quoted}")
@@ -1127,7 +1198,7 @@ class MySQLDeduplicator:
1127
1198
  except Exception as e:
1128
1199
  logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
1129
1200
  # 回滚:如临时表存在则删掉,恢复原表结构
1130
- with self._get_connection() as conn:
1201
+ with self._conn_ctx() as conn:
1131
1202
  with conn.cursor() as cursor:
1132
1203
  try:
1133
1204
  cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
@@ -1135,7 +1206,7 @@ class MySQLDeduplicator:
1135
1206
  logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
1136
1207
  # 恢复原表(如备份表存在)
1137
1208
  try:
1138
- with self._get_connection() as conn2:
1209
+ with self._conn_ctx() as conn2:
1139
1210
  with conn2.cursor() as cursor2:
1140
1211
  if self._check_table_exists(database, backup_table):
1141
1212
  cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
@@ -1227,23 +1298,16 @@ def main():
1227
1298
  batch_size=1000,
1228
1299
  skip_system_dbs=True,
1229
1300
  max_retries=3,
1230
- retry_interval=5,
1231
- pool_size=5,
1301
+ retry_waiting_time=5,
1302
+ # pool_size=30,
1232
1303
  recent_month=1,
1233
1304
  # date_range=['2025-06-09', '2025-06-10'],
1234
1305
  date_column='日期',
1235
- exclude_columns=None,
1236
1306
  exclude_databases=['测试库4'],
1237
1307
  exclude_tables={
1238
1308
  '推广数据2': [
1239
1309
  '地域报表_城市_2025_04',
1240
- '地域报表_城市_2025_05',
1241
- '地域报表_城市_2025_06',
1242
1310
  # '地域报表_城市_2025_04_copy1',
1243
- # '地域报表_城市_2025_05_copy1',
1244
- # '地域报表_城市_2025_06_copy1',
1245
- '奥莱店_主体报表',
1246
- # '奥莱店_主体报表_copy1',
1247
1311
  ],
1248
1312
  "生意参谋3": [
1249
1313
  "商品排行_2025",
@@ -1255,10 +1319,10 @@ def main():
1255
1319
  deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
1256
1320
 
1257
1321
  # # 指定数据库去重(多线程)
1258
- # deduplicator.deduplicate_database('推广数据2', dry_run=False, parallel=True, reorder_id=True)
1322
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)
1259
1323
 
1260
1324
  # # 指定表去重(使用特定列)
1261
- # deduplicator.deduplicate_table('推广数据2', '地域报表_城市_2025_06_copy1', columns=[], dry_run=False, reorder_id=True)
1325
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)
1262
1326
 
1263
1327
  # # 重排id列
1264
1328
  # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.11.11
3
+ Version: 3.12.0
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=GrY3av2BYeEaosI2qWYizQyTwyijdq8IuOuFjTJqLxE,19
2
+ mdbq/__version__.py,sha256=W8WVhYkHLU0SBDlL9Q6XQVTqIrzYjc1kFBZgqzS_NEI,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
5
5
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
8
8
  mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/deduplicator.py,sha256=e84MLhWjdCoDB8GxUV-z5drn8hdKGlJKnHzNW0rjIM8,65345
11
+ mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
12
12
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
13
13
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
14
14
  mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
24
24
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
25
25
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
26
26
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
27
- mdbq-3.11.11.dist-info/METADATA,sha256=NHTu8tsBwtvh90jaiNN4E4i9SW5xkH6P-yYcBrxwSbU,365
28
- mdbq-3.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
- mdbq-3.11.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
- mdbq-3.11.11.dist-info/RECORD,,
27
+ mdbq-3.12.0.dist-info/METADATA,sha256=Q6EyaC61H4okFva6YFV2a0Y3Iqun8L8mnpSkeVXcFdc,364
28
+ mdbq-3.12.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
+ mdbq-3.12.0.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
+ mdbq-3.12.0.dist-info/RECORD,,
File without changes