mdbq 3.11.11__py3-none-any.whl → 3.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
- VERSION = '3.11.11'
+ VERSION = '3.12.1'
mdbq/mysql/deduplicator.py CHANGED
@@ -14,6 +14,7 @@ from collections import defaultdict
  import sys
  from datetime import datetime
  import uuid
+ from contextlib import contextmanager


  warnings.filterwarnings('ignore')
@@ -47,8 +48,8 @@ class MySQLDeduplicator:
  batch_size: int = 1000,
  skip_system_dbs: bool = True,
  max_retries: int = 3,
- retry_interval: int = 5,
- pool_size: int = 5,
+ retry_waiting_time: int = 5,
+ pool_size: int = 10,
  primary_key: str = 'id',
  date_range: Optional[List[str]] = None,
  recent_month: Optional[int] = None,
@@ -87,15 +88,30 @@ class MySQLDeduplicator:
  cursorclass=pymysql.cursors.DictCursor
  )

+ # 并发模式要将 pool_size 加大
+ MAX_POOL_SIZE = 200
+ MAX_WORKERS = 4
+ if max_workers > MAX_WORKERS:
+ logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
+ max_workers = MAX_WORKERS
+ expected_threads = max_workers * 10
+ if pool_size < expected_threads:
+ logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
+ pool_size = expected_threads
+ if pool_size > MAX_POOL_SIZE:
+ logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
+ pool_size = MAX_POOL_SIZE
+ self.max_workers = max_workers
+ self.pool_size = pool_size
+
  # 配置参数
- self.max_workers = min(max(1, max_workers), pool_size) # 限制最大线程数,不能超过连接池
  self.batch_size = batch_size
  self.skip_system_dbs = skip_system_dbs
  self.max_retries = max_retries
- self.retry_interval = retry_interval
+ self.retry_waiting_time = retry_waiting_time
  self.primary_key = primary_key

- # 时间范围参数(只保留解析后的结果,去除冗余原始参数)
+ # 时间范围参数
  self.date_column = date_column
  self._dedup_start_date = None
  self._dedup_end_date = None
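
Note: 3.12.1 drops the old `min(max(1, max_workers), pool_size)` coupling in favor of explicit clamping: workers are capped at 4, the pool is raised to at least 10 connections per worker, and the pool itself is capped at 200. A minimal sketch of that rule (the helper name is ours; the warnings are omitted):

    def clamp_concurrency(max_workers: int, pool_size: int) -> tuple:
        # Mirror of the 3.12.1 constructor logic: cap workers first, then
        # size the connection pool at 10 connections per worker, bounded
        # above by 200.
        MAX_POOL_SIZE = 200
        MAX_WORKERS = 4
        max_workers = min(max_workers, MAX_WORKERS)
        pool_size = max(pool_size, max_workers * 10)
        pool_size = min(pool_size, MAX_POOL_SIZE)
        return max_workers, pool_size

    assert clamp_concurrency(8, 5) == (4, 40)

The default `pool_size` moving from 5 to 10 in the signature above is consistent with this new 10-connections-per-worker floor.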
@@ -128,6 +144,9 @@ class MySQLDeduplicator:
  year -= 1
  self._dedup_start_date = f"{year}-{month:02d}-01"
  self._dedup_end_date = today.strftime("%Y-%m-%d")
+
+ if self._dedup_start_date and self._dedup_end_date:
+ logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})

  # 排除列处理,直接合并去重
  self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
@@ -164,6 +183,14 @@ class MySQLDeduplicator:
  logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
  raise ConnectionError(f"连接数据库失败: {str(e)}")

+ @contextmanager
+ def _conn_ctx(self):
+ conn = self._get_connection()
+ try:
+ yield conn
+ finally:
+ conn.close()
+
  @staticmethod
  def _retry_on_failure(func: Any) -> Any:
  """
@@ -187,7 +214,7 @@ class MySQLDeduplicator:
  except (pymysql.OperationalError, pymysql.InterfaceError) as e:
  last_exception = e
  if attempt < self.max_retries:
- wait_time = self.retry_interval * (attempt + 1)
+ wait_time = self.retry_waiting_time * (attempt + 1)
  logger.warning(
  f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
  {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
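
Note: the retry_interval → retry_waiting_time rename is purely cosmetic; the policy remains a linear backoff where waits grow with the attempt number. With the default retry_waiting_time=5, successive failed attempts wait 5s, 10s, 15s, and so on:

    retry_waiting_time = 5            # renamed parameter (was retry_interval)
    for attempt in range(3):          # max_retries = 3
        wait_time = retry_waiting_time * (attempt + 1)
        print(f"attempt {attempt + 1} failed, next wait: {wait_time}s")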
@@ -203,7 +230,6 @@ class MySQLDeduplicator:
  raise Exception("未知错误")
  return wrapper

- @_retry_on_failure
  def _get_databases(self) -> List[str]:
  """
  获取所有非系统数据库列表,排除 exclude_databases。
@@ -212,7 +238,7 @@ class MySQLDeduplicator:
  List[str]: 数据库名列表。
  """
  sql = "SHOW DATABASES"
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql)
  all_dbs = [row['Database'] for row in cursor.fetchall()]
@@ -220,7 +246,6 @@ class MySQLDeduplicator:
  filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
  return filtered

- @_retry_on_failure
  def _get_tables(self, database: str) -> List[str]:
  """
  获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。
@@ -231,15 +256,12 @@ class MySQLDeduplicator:
  List[str]: 表名列表。
  """
  sql = "SHOW TABLES"
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(f"USE `{database}`")
  cursor.execute(sql)
- # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
  return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]

- @_retry_on_failure
  def _get_table_columns(self, database: str, table: str) -> List[str]:
  """
  获取指定表的所有列名(排除主键列)。
@@ -256,14 +278,12 @@ class MySQLDeduplicator:
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
  ORDER BY ORDINAL_POSITION
  """
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql, (database, table))
  return [row['COLUMN_NAME'] for row in cursor.fetchall()
  if row['COLUMN_NAME'].lower() != self.primary_key.lower()]

- @_retry_on_failure
  def _ensure_index(self, database: str, table: str, date_column: str) -> None:
  """
  检查并为 date_column 自动创建索引(如果未存在)。
@@ -273,7 +293,7 @@ class MySQLDeduplicator:
  table (str): 表名。
  date_column (str): 需要检查的日期列名。
  """
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  # 检查索引是否已存在
  cursor.execute(
@@ -295,7 +315,33 @@ class MySQLDeduplicator:
  except Exception as e:
  logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})

- @_retry_on_failure
+ def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
+ """
+ 生成器:分批拉取表数据,避免一次性加载全部数据到内存。
+ Args:
+ database (str): 数据库名。
+ table (str): 表名。
+ select_cols (str): 选择的列字符串。
+ select_where (str): where条件字符串。
+ batch_size (int): 每批拉取的行数。
+ Yields:
+ dict: 每行数据。
+ """
+ offset = 0
+ while True:
+ sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
+ with self._conn_ctx() as conn:
+ with conn.cursor() as cursor:
+ cursor.execute(sql)
+ rows = cursor.fetchall()
+ if not rows:
+ break
+ for row in rows:
+ yield row
+ if len(rows) < batch_size:
+ break
+ offset += batch_size
+
  def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
  """
  获取表中所有不同的日期分区(按天)。
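
Note: the new _row_generator pages through a table with LIMIT/OFFSET, opening a fresh pooled connection per batch so nothing stays open across the whole scan. OFFSET pagination rescans the skipped rows, so per-batch cost grows with the offset on large tables; a keyset variant that seeks on the auto-increment id is the usual alternative. A hedged sketch, not the package's method (conn_ctx stands in for the class's _conn_ctx; rows are dicts because the class uses DictCursor):

    def iter_rows_keyset(conn_ctx, database, table, batch_size=10000):
        # Keyset pagination: seek past the last seen `id` instead of using
        # OFFSET, so each batch is a bounded index range scan (assumes an
        # auto-increment `id` primary key).
        last_id = 0
        while True:
            sql = (f"SELECT * FROM `{database}`.`{table}` "
                   f"WHERE `id` > %s ORDER BY `id` LIMIT %s")
            with conn_ctx() as conn:
                with conn.cursor() as cursor:
                    cursor.execute(sql, (last_id, batch_size))
                    rows = cursor.fetchall()
            if not rows:
                break
            yield from rows
            last_id = rows[-1]['id']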
@@ -308,7 +354,7 @@ class MySQLDeduplicator:
  List[str]: 所有不同的日期(字符串)。
  """
  sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql)
  return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
@@ -367,7 +413,7 @@ class MySQLDeduplicator:
  pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
  where_sql = f"t.`{time_col}` = '{date_val}'"
  # 获取原始数据总量(只统计当天数据)
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  count_where = f"WHERE `{time_col}` = '{date_val}'"
  count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
@@ -394,7 +440,7 @@ class MySQLDeduplicator:
  del_ids.extend(ids[1:])
  affected_rows = 0
  if not dry_run and del_ids:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  for i in range(0, len(del_ids), self.batch_size):
  batch_ids = del_ids[i:i+self.batch_size]
@@ -418,7 +464,7 @@ class MySQLDeduplicator:
  GROUP BY {column_list}
  HAVING COUNT(*) > 1
  """
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  logger.debug('创建临时表SQL', {'sql': create_temp_sql})
  cursor.execute(create_temp_sql)
@@ -484,7 +530,7 @@ class MySQLDeduplicator:
  pk = self.primary_key
  pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
  # 获取原始数据总量
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
  logger.debug('执行SQL', {'sql': count_sql})
@@ -508,7 +554,7 @@ class MySQLDeduplicator:
  del_ids.extend(ids[1:])
  affected_rows = 0
  if not dry_run and del_ids:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  for i in range(0, len(del_ids), self.batch_size):
  batch_ids = del_ids[i:i+self.batch_size]
@@ -529,7 +575,7 @@ class MySQLDeduplicator:
  GROUP BY {column_list}
  HAVING COUNT(*) > 1
  """
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  logger.debug('创建临时表SQL', {'sql': create_temp_sql})
  cursor.execute(create_temp_sql)
@@ -584,7 +630,7 @@ class MySQLDeduplicator:
  logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
  if temp_table:
  try:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
  cursor.execute(drop_temp_sql)
@@ -628,13 +674,14 @@ class MySQLDeduplicator:
  logger.info('单表开始', {
  "库": database,
  "表": table,
- "参数": {
- "指定去重列": columns,
- "去重方式": "Python" if use_python_dedup else "SQL",
- "数据处理": self.duplicate_keep_mode,
- "模拟运行": dry_run,
- '排除列': self.exclude_columns,
- }})
+ # "参数": {
+ # "指定去重列": columns,
+ # "去重方式": "Python" if use_python_dedup else "SQL",
+ # "数据处理": self.duplicate_keep_mode,
+ # "模拟运行": dry_run,
+ # '排除列': self.exclude_columns,
+ # },
+ })
  all_columns = self._get_table_columns(database, table)
  all_columns_lower = [col.lower() for col in all_columns]
  time_col = self.date_column
@@ -680,7 +727,7 @@ class MySQLDeduplicator:
  logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
  total_dup += dup_count
  total_del += affected_rows
- logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
+ logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
  # 自动重排id列(仅当有实际删除时且reorder_id为True)
  if reorder_id and total_del > 0:
  try:
@@ -688,10 +735,12 @@ class MySQLDeduplicator:
  logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
  except Exception as e:
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+ if affected_rows > 0:
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
  return (total_dup, total_del)
  # 没有date_column,直接全表去重
  result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
- logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
+ logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
  dup_count, affected_rows = result
  if reorder_id and affected_rows > 0:
  try:
@@ -699,6 +748,8 @@ class MySQLDeduplicator:
  logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
  except Exception as e:
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+ if affected_rows > 0:
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
  return result
  except Exception as e:
  logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -770,7 +821,11 @@ class MySQLDeduplicator:
  results[table] = (dup_count, affected_rows)
  total_dup = sum(r[0] for r in results.values())
  total_del = sum(r[1] for r in results.values())
- logger.info('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+ logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+ # 只显示有删除的详细结果
+ if total_del > 0:
+ filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
+ logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
  return results
  except Exception as e:
  logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
@@ -819,7 +874,8 @@ class MySQLDeduplicator:
  'use_python_dedup': use_python_dedup
  },
  })
- if parallel and self.max_workers > 1:
+ # 如果parallel=True且库数量大于1,则只在外层并发,内层串行
+ if parallel and self.max_workers > 1 and len(target_dbs) > 1:
  with concurrent.futures.ThreadPoolExecutor(
  max_workers=self.max_workers
  ) as executor:
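
Note: deduplicate_all now fans out one thread per database, and only when parallel=True and there is more than one target database; within each database the tables are processed serially (the hard-coded False passed to deduplicate_database in the next hunk). A sketch of this outer-parallel/inner-serial shape (helper name is ours):

    import concurrent.futures

    def run_per_database(dbs, work, max_workers=4):
        # One thread per database; `work` itself stays serial, so the
        # total thread count is bounded by max_workers.
        results = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(work, db): db for db in dbs}
            for future in concurrent.futures.as_completed(futures):
                results[futures[future]] = future.result()
        return results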
@@ -827,6 +883,7 @@ class MySQLDeduplicator:
  for db in target_dbs:
  tables = tables_map.get(db) if tables_map else None
  db_columns_map = columns_map.get(db) if columns_map else None
+ # 内层强制串行
  futures[executor.submit(
  self.deduplicate_database,
  db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
@@ -855,7 +912,7 @@ class MySQLDeduplicator:
  r[1] for db in all_results.values()
  for r in db.values()
  )
- logger.info('全局完成', {
+ logger.debug('全局完成', {
  "总重复组": total_dup,
  "总删除行": total_del,
  "参数": {
@@ -867,12 +924,30 @@ class MySQLDeduplicator:
  },
  "详细结果": dict(all_results)
  })
+ # 只显示有删除的详细结果
+ if total_del > 0:
+ filtered_results = {
+ db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
+ for db, tbls in all_results.items()
+ }
+ filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
+ logger.info('全局完成(仅显示有删除的结果)', {
+ "总重复组": total_dup,
+ "总删除行": total_del,
+ "参数": {
+ "模拟运行": dry_run,
+ "并行处理": parallel,
+ '排除列': self.exclude_columns,
+ '重排id': reorder_id,
+ 'use_python_dedup': use_python_dedup
+ },
+ "详细结果": filtered_results
+ })
  return all_results
  except Exception as e:
  logger.error('异常', {"error": str(e), 'traceback': repr(e)})
  return all_results

- @_retry_on_failure
  def _check_database_exists(self, database: str) -> bool:
  """
  检查数据库是否存在。
@@ -883,13 +958,11 @@ class MySQLDeduplicator:
  bool: 数据库是否存在。
  """
  sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql, (database,))
  return bool(cursor.fetchone())

- @_retry_on_failure
  def _check_table_exists(self, database: str, table: str) -> bool:
  """
  检查表是否存在。
@@ -905,13 +978,11 @@ class MySQLDeduplicator:
  FROM INFORMATION_SCHEMA.TABLES
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
  """
-
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(sql, (database, table))
  return bool(cursor.fetchone())

- @_retry_on_failure
  def _get_table_info(self, database: str, table: str, id_column: str = None):
  """
  获取表的所有列名、主键列名列表、指定id列是否为主键。
@@ -923,7 +994,7 @@ class MySQLDeduplicator:
  Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
  """
  id_column = id_column or self.primary_key
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute("""
  SELECT COLUMN_NAME, COLUMN_KEY
@@ -1032,7 +1103,7 @@ class MySQLDeduplicator:
  logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
  return False
  # 检查外键约束
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute("""
  SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
@@ -1042,7 +1113,7 @@ class MySQLDeduplicator:
  logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
  return False
  # 获取表结构
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
  create_table_sql = cursor.fetchone()['Create Table']
@@ -1055,7 +1126,7 @@ class MySQLDeduplicator:
  backup_table = self._make_backup_table_name(table)
  backup_table_quoted = f"`{database}`.`{backup_table}`"
  try:
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  # 1. 创建临时表,结构同原表
  try:
@@ -1116,7 +1187,7 @@ class MySQLDeduplicator:
  logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
  return False
  logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
- # 5. 可选:自动删除备份表
+ # 5. 自动删除备份表
  if auto_drop_backup:
  try:
  cursor.execute(f"DROP TABLE {backup_table_quoted}")
@@ -1127,7 +1198,7 @@ class MySQLDeduplicator:
  except Exception as e:
  logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
  # 回滚:如临时表存在则删掉,恢复原表结构
- with self._get_connection() as conn:
+ with self._conn_ctx() as conn:
  with conn.cursor() as cursor:
  try:
  cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
@@ -1135,7 +1206,7 @@ class MySQLDeduplicator:
  logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
  # 恢复原表(如备份表存在)
  try:
- with self._get_connection() as conn2:
+ with self._conn_ctx() as conn2:
  with conn2.cursor() as cursor2:
  if self._check_table_exists(database, backup_table):
  cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
@@ -1227,23 +1298,16 @@ def main():
  batch_size=1000,
  skip_system_dbs=True,
  max_retries=3,
- retry_interval=5,
- pool_size=5,
+ retry_waiting_time=5,
+ # pool_size=30,
  recent_month=1,
  # date_range=['2025-06-09', '2025-06-10'],
  date_column='日期',
- exclude_columns=None,
  exclude_databases=['测试库4'],
  exclude_tables={
  '推广数据2': [
  '地域报表_城市_2025_04',
- '地域报表_城市_2025_05',
- '地域报表_城市_2025_06',
  # '地域报表_城市_2025_04_copy1',
- # '地域报表_城市_2025_05_copy1',
- # '地域报表_城市_2025_06_copy1',
- '奥莱店_主体报表',
- # '奥莱店_主体报表_copy1',
  ],
  "生意参谋3": [
  "商品排行_2025",
@@ -1255,10 +1319,10 @@ def main():
  deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)

  # # 指定数据库去重(多线程)
- # deduplicator.deduplicate_database('推广数据2', dry_run=False, parallel=True, reorder_id=True)
+ # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)

  # # 指定表去重(使用特定列)
- # deduplicator.deduplicate_table('推广数据2', '地域报表_城市_2025_06_copy1', columns=[], dry_run=False, reorder_id=True)
+ # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)

  # # 重排id列
  # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
mdbq/mysql/uploader.py CHANGED
@@ -23,8 +23,8 @@ logger = mylogger.MyLogger(
  max_log_size=50,
  backup_count=5,
  enable_async=False, # 是否启用异步日志
- sample_rate=1, # 采样50%的DEBUG/INFO日志
- sensitive_fields=[], # 敏感字段列表
+ sample_rate=1, # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
+ sensitive_fields=[], # 过滤敏感字段列表
  )


@@ -83,7 +83,7 @@ class MySQLUploader:
  charset: str = 'utf8mb4',
  collation: str = 'utf8mb4_0900_ai_ci',
  max_retries: int = 10,
- retry_interval: int = 10,
+ retry_waiting_time: int = 10,
  pool_size: int = 5,
  connect_timeout: int = 10,
  read_timeout: int = 30,
@@ -100,7 +100,7 @@ class MySQLUploader:
  :param charset: 字符集,默认为utf8mb4
  :param collation: 排序规则,默认为utf8mb4_0900_ai_ci,对大小写不敏感,utf8mb4_0900_as_cs/utf8mb4_bin: 对大小写敏感
  :param max_retries: 最大重试次数,默认为10
- :param retry_interval: 重试间隔(秒),默认为10
+ :param retry_waiting_time: 重试间隔(秒),默认为10
  :param pool_size: 连接池大小,默认为5
  :param connect_timeout: 连接超时(秒),默认为10
  :param read_timeout: 读取超时(秒),默认为30
@@ -114,7 +114,7 @@ class MySQLUploader:
  self.charset = charset
  self.collation = collation
  self.max_retries = max(max_retries, 1)
- self.retry_interval = max(retry_interval, 1)
+ self.retry_waiting_time = max(retry_waiting_time, 1)
  self.pool_size = max(pool_size, 1)
  self.connect_timeout = connect_timeout
  self.read_timeout = read_timeout
@@ -169,7 +169,7 @@ class MySQLUploader:
  }
  try:
  pool = PooledDB(**pool_params)
- logger.info('连接池创建成功', {'连接池': self.pool_size, 'host': self.host, 'port': self.port})
+ logger.debug('连接池创建成功', {'连接池': self.pool_size, 'host': self.host, 'port': self.port})
  return pool
  except Exception as e:
  self.pool = None
@@ -188,14 +188,11 @@ class MySQLUploader:
  def wrapper(self, *args, **kwargs):
  last_exception = None
  operation = func.__name__
- logger.debug(f'开始执行操作: {operation}', {'max_retries': self.max_retries})
  for attempt in range(self.max_retries):
  try:
  result = func(self, *args, **kwargs)
  if attempt > 0:
  logger.info('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
- else:
- logger.debug('操作成功', {'operation': operation})
  return result
  except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
  last_exception = e
@@ -207,7 +204,7 @@ class MySQLUploader:
  'max_retries': self.max_retries
  }
  if attempt < self.max_retries - 1:
- wait_time = self.retry_interval * (attempt + 1)
+ wait_time = self.retry_waiting_time * (attempt + 1)
  error_details['wait_time'] = wait_time
  logger.warning('数据库操作失败,准备重试', error_details)
  time.sleep(wait_time)
@@ -218,13 +215,6 @@ class MySQLUploader:
  logger.error('重连失败', {'error': str(reconnect_error)})
  else:
  logger.error('操作最终失败', error_details)
- except pymysql.IntegrityError as e:
- logger.error('完整性约束错误', {
- 'operation': operation,
- 'error_code': e.args[0] if e.args else None,
- 'error_message': e.args[1] if len(e.args) > 1 else None
- })
- raise e
  except Exception as e:
  last_exception = e
  logger.error('发生意外错误', {
@@ -247,10 +237,9 @@ class MySQLUploader:
  """
  try:
  conn = self.pool.connection()
- logger.debug('获取数据库连接', {'host': self.host, 'port': self.port})
  return conn
  except Exception as e:
- logger.error('获取数据库连接失败', {'error': str(e)})
+ logger.error('从连接池获取数据库连接失败', {'error': str(e)})
  raise ConnectionError(f'连接数据库失败: {str(e)}')

  @_execute_with_retry
@@ -392,7 +381,8 @@ class MySQLUploader:
  primary_keys: Optional[List[str]] = None,
  date_column: Optional[str] = None,
  indexes: Optional[List[str]] = None,
- allow_null: bool = False
+ allow_null: bool = False,
+ unique_keys: Optional[List[List[str]]] = None
  ) -> None:
  """
  创建数据表,优化索引创建方式
@@ -402,39 +392,48 @@ class MySQLUploader:
  if not set_typ:
  logger.error('建表时未指定set_typ', {'库': db_name, '表': table_name})
  raise ValueError('set_typ 未指定')
+ # set_typ的键清洗
+ set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
  column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
  for col_name, col_type in set_typ.items():
- if col_name.lower() == 'id':
+ if col_name == 'id':
  continue
- safe_col_name = self._validate_identifier(col_name)
+ safe_col_name = self._normalize_col(col_name)
  col_def = f"`{safe_col_name}` {col_type}"
  if not allow_null and not col_type.lower().startswith('json'):
  col_def += " NOT NULL"
  column_defs.append(col_def)
- if primary_keys:
- if 'id' not in [pk.lower() for pk in primary_keys]:
- primary_keys = ['id'] + primary_keys
+ # 主键处理逻辑调整
+ if primary_keys and len(primary_keys) > 0:
+ safe_primary_keys = [self._normalize_col(pk) for pk in primary_keys]
+ primary_key_sql = f"PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
  else:
- primary_keys = ['id']
- safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
- primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
+ safe_primary_keys = [self._normalize_col('id')]
+ primary_key_sql = f"PRIMARY KEY (`id`)"
  # 索引统一在CREATE TABLE中定义
  index_defs = []
  if date_column and date_column in set_typ:
- safe_date_col = self._validate_identifier(date_column)
+ safe_date_col = self._normalize_col(date_column)
  index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
  if indexes:
  for idx_col in indexes:
  if idx_col in set_typ:
- safe_idx_col = self._validate_identifier(idx_col)
+ safe_idx_col = self._normalize_col(idx_col)
  index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
+ # UNIQUE KEY定义
+ unique_defs = []
+ if unique_keys:
+ for idx, unique_cols in enumerate(unique_keys):
+ if not unique_cols:
+ continue
+ safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
+ unique_name = f"uniq_{'_'.join(safe_unique_cols)}_{idx}"
+ unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
  index_defs = list(set(index_defs))
- index_sql = (',' + ','.join(index_defs)) if index_defs else ''
+ all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
  sql = f"""
  CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
- {','.join(column_defs)}
- {primary_key_sql}
- {index_sql}
+ {','.join(all_defs)}
  ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
  """
  conn = None
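
Note: _create_table now assembles column definitions, the primary key, secondary indexes and UNIQUE KEYs into a single all_defs list and emits them in one CREATE TABLE. For unique_keys=[['日期', 'name', 'age']] the generated DDL should look roughly like the following reconstruction; the column types are illustrative, not taken from the diff, and the real statement is emitted on one line via ','.join (the line breaks here are only for readability):

    expected_ddl = """
    CREATE TABLE IF NOT EXISTS `my_db`.`my_table` (
      `id` INT NOT NULL AUTO_INCREMENT,
      `日期` DATE NOT NULL,
      `name` VARCHAR(255) NOT NULL,
      `age` INT NOT NULL,
      PRIMARY KEY (`id`),
      INDEX `idx_日期` (`日期`),
      UNIQUE KEY `uniq_日期_name_age_0` (`日期`,`name`,`age`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
    """

The `_0` suffix on the constraint name is the enumerate() index from the loop above, which keeps names distinct when several unique_keys are supplied.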
@@ -443,7 +442,7 @@ class MySQLUploader:
  with conn.cursor() as cursor:
  cursor.execute(sql)
  conn.commit()
- logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes})
+ logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
  except Exception as e:
  logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
  if conn is not None:
@@ -476,11 +475,9 @@ class MySQLUploader:
  try:
  if date_type:
  result = pd.to_datetime(datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d'))
- logger.debug('日期格式化成功', {'原始': value, '格式': fmt, '结果': str(result)})
  return result
  else:
  result = datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
- logger.debug('日期格式化成功', {'原始': value, '格式': fmt, '结果': str(result)})
  return result
  except ValueError:
  continue
@@ -613,7 +610,7 @@ class MySQLUploader:
  cursor.execute(sql_check, (db_name, table_name, column))
  exists = cursor.fetchone()
  if exists and list(exists.values())[0] > 0:
- logger.debug('索引已存在', {'库': db_name, '表': table_name, '': column})
+ logger.debug('索引检查', {'库': db_name, '表': table_name, '索引列': column})
  return
  cursor.execute(sql_create)
  conn.commit()
@@ -622,6 +619,49 @@ class MySQLUploader:
  logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
  raise

+ def _get_existing_unique_keys(self, db_name: str, table_name: str) -> List[List[str]]:
+ """
+ 获取表中所有UNIQUE KEY的列组合(不含主键)。
+ 返回:[[col1, col2], ...]
+ """
+ db_name = self._validate_identifier(db_name)
+ table_name = self._validate_identifier(table_name)
+ sql = '''
+ SELECT INDEX_NAME, COLUMN_NAME
+ FROM INFORMATION_SCHEMA.STATISTICS
+ WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND NON_UNIQUE = 0 AND INDEX_NAME != 'PRIMARY'
+ ORDER BY INDEX_NAME, SEQ_IN_INDEX
+ '''
+ unique_map = {}
+ try:
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ cursor.execute(sql, (db_name, table_name))
+ for row in cursor.fetchall():
+ idx = row['INDEX_NAME']
+ col = row['COLUMN_NAME']
+ unique_map.setdefault(idx, []).append(col)
+ except Exception as e:
+ logger.warning('获取UNIQUE KEY信息失败', {'库': db_name, '表': table_name, '错误': str(e)})
+ # 只返回列名组合,全部清洗小写
+ return [[self._normalize_col(c) for c in cols] for cols in unique_map.values() if cols]
+
+ def _add_unique_key(self, db_name: str, table_name: str, unique_cols: List[str]):
+ """
+ 添加UNIQUE KEY
+ """
+ safe_cols = [self._normalize_col(col) for col in unique_cols]
+ unique_name = f"uniq_{'_'.join(safe_cols)}_{int(time.time()*1000)%100000}"
+ sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
+ try:
+ with self._get_connection() as conn:
+ with conn.cursor() as cursor:
+ cursor.execute(sql)
+ conn.commit()
+ logger.info('添加唯一约束列成功', {'库': db_name, '表': table_name, '列': unique_cols})
+ except Exception as e:
+ logger.warning('唯一约束列添加失败', {'库': db_name, '表': table_name, '列': unique_cols, '错误': str(e)})
+
  def _upload_to_table(
  self,
  db_name: str,
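
Note: when the table already exists, _upload_to_table (next hunk) reconciles the requested unique_keys against what INFORMATION_SCHEMA.STATISTICS reports: column combinations are compared lowercased and order-insensitively, at most 10 unique indexes are kept per table, and only the missing combinations are passed to _add_unique_key. A sketch of that comparison step (the helper name is ours):

    def missing_unique_keys(requested, existing):
        # Compare column combos case- and order-insensitively, as the
        # uploader does before issuing ALTER TABLE ... ADD UNIQUE KEY.
        norm = lambda cols: tuple(sorted(c.lower() for c in cols))
        have = {norm(uk) for uk in existing}
        return [uk for uk in requested
                if 1 <= len(uk) <= 20 and norm(uk) not in have]

    print(missing_unique_keys([['日期', 'Name'], ['sku']], [['name', '日期']]))
    # -> [['sku']]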
@@ -637,14 +677,15 @@ class MySQLUploader:
  indexes: Optional[List[str]],
  batch_id: Optional[str] = None,
  update_on_duplicate: bool = False,
- transaction_mode: str = "batch"
+ transaction_mode: str = "batch",
+ unique_keys: Optional[List[List[str]]] = None
  ):
  """实际执行表上传的方法"""
- # 检查表是否存在
- if not self._check_table_exists(db_name, table_name):
+ table_existed = self._check_table_exists(db_name, table_name)
+ if not table_existed:
  if auto_create:
  self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
- allow_null=allow_null)
+ allow_null=allow_null, unique_keys=unique_keys)
  else:
  logger.error('数据表不存在', {
  '库': db_name,
@@ -652,8 +693,30 @@ class MySQLUploader:
  'func': sys._getframe().f_code.co_name,
  })
  raise ValueError(f"数据表不存在: `{db_name}`.`{table_name}`")
-
- # 获取表结构并验证
+ if table_existed and unique_keys:
+ try:
+ exist_ukeys = self._get_existing_unique_keys(db_name, table_name)
+ exist_ukeys_norm = [sorted([c.lower() for c in uk]) for uk in exist_ukeys]
+ filtered_ukeys = [uk for uk in unique_keys if 1 <= len(uk) <= 20]
+ to_add = []
+ for uk in filtered_ukeys:
+ norm_uk = sorted([c.lower() for c in uk])
+ if norm_uk not in exist_ukeys_norm:
+ to_add.append(uk)
+ max_unique_keys = 10
+ if len(exist_ukeys) + len(to_add) > max_unique_keys:
+ logger.warning('unique_keys超限', {
+ '库': db_name,
+ '表': table_name,
+ '已存在': exist_ukeys,
+ '本次待添加': to_add,
+ '最大数量': max_unique_keys
+ })
+ to_add = to_add[:max_unique_keys - len(exist_ukeys)]
+ for uk in to_add:
+ self._add_unique_key(db_name, table_name, uk)
+ except Exception as e:
+ logger.warning('动态unique key处理异常', {'库': db_name, '表': table_name, '错误': str(e)})
  table_columns = self._get_table_columns(db_name, table_name)
  if not table_columns:
  logger.error('获取列失败', {
@@ -663,8 +726,6 @@ class MySQLUploader:
  'func': sys._getframe().f_code.co_name,
  })
  raise ValueError(f"获取列失败 `{db_name}`.`{table_name}`")
-
- # 验证数据列与表列匹配
  for col in set_typ:
  if col not in table_columns:
  logger.error('列不存在', {
@@ -674,22 +735,19 @@ class MySQLUploader:
  'func': sys._getframe().f_code.co_name,
  })
  raise ValueError(f"列不存在: `{col}` -> `{db_name}`.`{table_name}`")
-
- # 确保分表参考字段为索引
  if date_column and date_column in table_columns:
  try:
  self._ensure_index(db_name, table_name, date_column)
  except Exception as e:
  logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
-
- # 插入数据
- self._insert_data(
+ inserted, skipped, failed = self._insert_data(
  db_name, table_name, data, set_typ,
  check_duplicate, duplicate_columns,
  batch_id=batch_id,
  update_on_duplicate=update_on_duplicate,
  transaction_mode=transaction_mode
  )
+ return inserted, skipped, failed

  def _infer_data_type(self, value: Any, no_log: bool = False) -> str:
  """
@@ -817,11 +875,8 @@ class MySQLUploader:
  # 统一处理原始数据中列名的特殊字符
  data = self.normalize_column_names(data)

- # set_typ的键处理
- if self.case_sensitive:
- set_typ = {k: v for k, v in set_typ.items()}
- else:
- set_typ = {k.lower(): v for k, v in set_typ.items()}
+ # set_typ的键清洗
+ set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}

  # 获取数据中实际存在的列名
  data_columns = set()
@@ -890,7 +945,8 @@ class MySQLUploader:
  auto_create: bool = True,
  indexes: Optional[List[str]] = None,
  update_on_duplicate: bool = False,
- transaction_mode: str = "batch"
+ transaction_mode: str = "batch",
+ unique_keys: Optional[List[List[str]]] = None
  ):
  """
  上传数据到数据库的主入口方法,分表逻辑异常处理统计丢弃数据
@@ -912,6 +968,7 @@ class MySQLUploader:
  - 'row' : 逐行提交事务(错误隔离性好)
  - 'batch' : 整批提交事务(性能最优)
  - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
+ :param unique_keys: 唯一约束列表,每个元素为列名列表,支持多列组合唯一约束
  :raises: 可能抛出各种验证和数据库相关异常
  """
  # upload_start = time.time()
@@ -936,7 +993,8 @@ class MySQLUploader:
  # '自动建表': auto_create,
  '索引': indexes,
  '更新旧数据': update_on_duplicate,
- '事务模式': transaction_mode
+ '事务模式': transaction_mode,
+ '唯一约束': unique_keys
  },
  # '数据样例': self._shorten_for_log(data, 2)
  })
@@ -1005,15 +1063,21 @@ class MySQLUploader:
  continue

  # 对每个分表执行上传
+ total_inserted = 0
+ total_skipped = dropped_rows # 分表异常丢弃
+ total_failed = 0
  for part_table, part_data in partitioned_data.items():
  try:
- self._upload_to_table(
+ inserted, skipped, failed = self._upload_to_table(
  db_name, part_table, part_data, filtered_set_typ,
  primary_keys, check_duplicate, duplicate_columns,
  allow_null, auto_create, partition_date_column,
- indexes, batch_id, update_on_duplicate, transaction_mode
+ indexes, batch_id, update_on_duplicate, transaction_mode,
+ unique_keys
  )
- # 确保分表参考字段为索引
+ total_inserted += inserted
+ total_skipped += skipped
+ total_failed += failed
  if partition_date_column in filtered_set_typ:
  try:
  self._ensure_index(db_name, part_table, partition_date_column)
@@ -1031,13 +1095,16 @@ class MySQLUploader:
  continue # 跳过当前分表,继续处理其他分表
  else:
  # 不分表,直接上传
- self._upload_to_table(
+ inserted, skipped, failed = self._upload_to_table(
  db_name, table_name, prepared_data, filtered_set_typ,
  primary_keys, check_duplicate, duplicate_columns,
  allow_null, auto_create, partition_date_column,
- indexes, batch_id, update_on_duplicate, transaction_mode
+ indexes, batch_id, update_on_duplicate, transaction_mode,
+ unique_keys
  )
- # 确保分表参考字段为索引
+ total_inserted = inserted
+ total_skipped = skipped
+ total_failed = failed
  if partition_date_column in filtered_set_typ:
  try:
  self._ensure_index(db_name, table_name, partition_date_column)
@@ -1062,7 +1129,9 @@ class MySQLUploader:
  '批次': batch_id,
  'finish': success_flag,
  '数据行': initial_row_count,
- '丢弃行数': dropped_rows
+ '插入': total_inserted,
+ '跳过': total_skipped,
+ '失败': total_failed
  })

  @_execute_with_retry
@@ -1095,26 +1164,19 @@ class MySQLUploader:
  - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
  """
  if not data:
- return
-
- # 验证事务模式
+ return 0, 0, 0
  transaction_mode = self._validate_transaction_mode(transaction_mode)
-
- # 准备SQL语句
  sql = self._prepare_insert_sql(
  db_name, table_name, set_typ,
  check_duplicate, duplicate_columns,
  update_on_duplicate
  )
-
- # 执行批量插入
  total_inserted, total_skipped, total_failed = self._execute_batch_insert(
  db_name, table_name, data, set_typ,
  sql, check_duplicate, duplicate_columns,
  batch_id, transaction_mode,
  update_on_duplicate
  )
-
  logger.info('插入完成', {
  '库': db_name,
  '表': table_name,
@@ -1124,6 +1186,7 @@ class MySQLUploader:
  '失败': total_failed,
  '事务模式': transaction_mode,
  })
+ return total_inserted, total_skipped, total_failed

  def _validate_transaction_mode(self, mode: str) -> str:
  """验证并标准化事务模式"""
@@ -1266,6 +1329,7 @@ class MySQLUploader:
  update_on_duplicate: bool = False
  ) -> Tuple[int, int, int]:
  """执行批量插入操作,优化batch和hybrid模式"""
+ import pymysql # 确保异常类型可用
  def get_optimal_batch_size(total_rows: int) -> int:
  if total_rows <= 100:
  return total_rows
@@ -1295,7 +1359,13 @@ class MySQLUploader:
  try:
  cursor.executemany(sql, values_list)
  conn.commit()
- total_inserted += len(batch)
+ inserted = cursor.rowcount if cursor.rowcount is not None else 0
+ total_inserted += inserted
+ total_skipped += len(batch) - inserted
+ except pymysql.err.IntegrityError as e:
+ conn.rollback()
+ total_skipped += len(batch)
+ logger.debug('批量插入唯一约束冲突,全部跳过', {'库': db_name, '表': table_name, '错误': str(e)})
  except Exception as e:
  conn.rollback()
  total_failed += len(batch)
@@ -1311,7 +1381,15 @@ class MySQLUploader:
  dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
  values += [row.get(col) for col in dup_cols]
  cursor.execute(sql, values)
- total_inserted += 1
+ affected = cursor.rowcount if cursor.rowcount is not None else 0
+ if affected > 0:
+ total_inserted += 1
+ else:
+ total_skipped += 1
+ except pymysql.err.IntegrityError as e:
+ conn.rollback()
+ total_skipped += 1
+ logger.debug('hybrid单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
  except Exception as e:
  conn.rollback()
  total_failed += 1
1317
1395
  total_failed += 1
@@ -1325,8 +1403,16 @@ class MySQLUploader:
1325
1403
  dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
1326
1404
  values += [row.get(col) for col in dup_cols]
1327
1405
  cursor.execute(sql, values)
1406
+ affected = cursor.rowcount if cursor.rowcount is not None else 0
1407
+ if affected > 0:
1408
+ total_inserted += 1
1409
+ else:
1410
+ total_skipped += 1
1328
1411
  conn.commit()
1329
- total_inserted += 1
1412
+ except pymysql.err.IntegrityError as e:
1413
+ conn.rollback()
1414
+ total_skipped += 1
1415
+ logger.debug('单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
1330
1416
  except Exception as e:
1331
1417
  conn.rollback()
1332
1418
  total_failed += 1
@@ -1347,9 +1433,9 @@ class MySQLUploader:
1347
1433
  self.pool = None
1348
1434
  except Exception as e:
1349
1435
  logger.warning('关闭连接池时出错', {'error': str(e)})
1350
- logger.info('连接池关闭', {'uploader.py': '连接池关闭'})
1436
+ logger.debug('finished', {'uploader.py': '连接池关闭'})
1351
1437
  except Exception as e:
1352
- logger.error('关闭连接池失败', {'error': str(e)})
1438
+ logger.error('关闭连接池失败', {'uploader.py': str(e)})
1353
1439
  raise
1354
1440
 
1355
1441
  def _check_pool_health(self) -> bool:
@@ -1431,6 +1517,13 @@ class MySQLUploader:
1431
1517
  def __exit__(self, exc_type, exc_val, exc_tb):
1432
1518
  self.close()
1433
1519
 
1520
+ def _normalize_col(self, col: str) -> str:
1521
+ """
1522
+ 列名自动清洗并转小写(如case_sensitive为False),保证和表结构一致。
1523
+ """
1524
+ safe = self._validate_identifier(col)
1525
+ return safe if self.case_sensitive else safe.lower()
1526
+
1434
1527
 
1435
1528
  def main():
1436
1529
  """
@@ -1443,7 +1536,7 @@ def main():
1443
1536
  """
1444
1537
  uploader = MySQLUploader(
1445
1538
  username='root',
1446
- password='pw',
1539
+ password='pwd',
1447
1540
  host='localhost',
1448
1541
  port=3306,
1449
1542
  )
@@ -1462,7 +1555,7 @@ def main():
1462
1555
  {'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
1463
1556
  {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': '100'},
1464
1557
  {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 30, 'salary': 0.0},
1465
- {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75}
1558
+ {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75},
1466
1559
  ]
1467
1560
 
1468
1561
  # 上传数据
@@ -1474,12 +1567,13 @@ def main():
1474
1567
  primary_keys=[], # 创建唯一主键
1475
1568
  check_duplicate=False, # 检查重复数据
1476
1569
  duplicate_columns=[], # 指定排重的组合键
1570
+ update_on_duplicate=False, # 更新旧数据
1477
1571
  allow_null=False, # 允许插入空值
1478
- partition_by='year', # 按月分表
1572
+ partition_by='year', # 分表方式
1479
1573
  partition_date_column='日期', # 用于分表的日期列名,默认为'日期'
1480
- auto_create=True, # 表不存在时自动创建, 默认参数不要更改
1481
- indexes=[], # 指定索引列
1574
+ indexes=[], # 普通索引列
1482
1575
  transaction_mode='row', # 事务模式
1576
+ unique_keys=[['日期', 'name', 'age']] # 唯一约束列表
1483
1577
  )
1484
1578
 
1485
1579
  uploader.close()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.11.11
3
+ Version: 3.12.1
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=GrY3av2BYeEaosI2qWYizQyTwyijdq8IuOuFjTJqLxE,19
2
+ mdbq/__version__.py,sha256=vHfePSxiigIQg58VIYYk2QYh_4AtpXtMsfV3nHXNUhg,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
5
5
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,10 +8,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
8
8
  mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/deduplicator.py,sha256=e84MLhWjdCoDB8GxUV-z5drn8hdKGlJKnHzNW0rjIM8,65345
11
+ mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
12
12
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
13
13
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
14
- mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
14
+ mdbq/mysql/uploader.py,sha256=PD8gA2PixoK2ZH4vWTmz1kbNTab8VGUJLoepD024H5Q,70265
15
15
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
16
16
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
17
17
  mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
24
24
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
25
25
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
26
26
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
27
- mdbq-3.11.11.dist-info/METADATA,sha256=NHTu8tsBwtvh90jaiNN4E4i9SW5xkH6P-yYcBrxwSbU,365
28
- mdbq-3.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
- mdbq-3.11.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
- mdbq-3.11.11.dist-info/RECORD,,
27
+ mdbq-3.12.1.dist-info/METADATA,sha256=viVkeKnHLlpvAxthu_c50VYyla5Uc2COG99IigfDPmc,364
28
+ mdbq-3.12.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
+ mdbq-3.12.1.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
+ mdbq-3.12.1.dist-info/RECORD,,
File without changes