mdbq 3.9.2__py3-none-any.whl → 3.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
- VERSION = '3.9.2'
+ VERSION = '3.9.4'
mdbq/mysql/mysql.py CHANGED
@@ -12,10 +12,14 @@ import os
  import logging
  import logging.handlers
  from mdbq.other import otk
- from typing import Union, List, Dict, Optional, Any, Tuple
+ from typing import Union, List, Dict, Optional, Any, Tuple, Set
  from dbutils.pooled_db import PooledDB
  import json
- import psutil  # used for monitoring resource usage
+ import psutil
+ from collections import OrderedDict
+ import threading
+ import concurrent.futures
+ from collections import defaultdict


  warnings.filterwarnings('ignore')
@@ -1111,6 +1115,18 @@ class OptimizeDatas:
          self.connection.close()


+ class StatementCache(OrderedDict):
+     """LRU-style eviction policy for cached statements"""
+     def __init__(self, maxsize=100):
+         super().__init__()
+         self.maxsize = maxsize
+
+     def __setitem__(self, key, value):
+         super().__setitem__(key, value)
+         if len(self) > self.maxsize:
+             self.popitem(last=False)
+
+
  class MySQLUploader:
      def __init__(
          self,
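A minimal standalone sketch of the eviction behavior the new StatementCache introduces. Because neither reads nor re-assignment move a key to the end, eviction follows insertion order (FIFO) rather than strict LRU:

    from collections import OrderedDict

    class StatementCache(OrderedDict):
        # same logic as the class added in the hunk above
        def __init__(self, maxsize=100):
            super().__init__()
            self.maxsize = maxsize

        def __setitem__(self, key, value):
            super().__setitem__(key, value)
            if len(self) > self.maxsize:
                self.popitem(last=False)  # evict the oldest entry

    cache = StatementCache(maxsize=2)
    cache['a'] = 1
    cache['b'] = 2
    cache['c'] = 3
    print(list(cache))  # ['b', 'c'] -- 'a' was evicted first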
@@ -1119,7 +1135,7 @@ class MySQLUploader:
          host: str = 'localhost',
          port: int = 3306,
          charset: str = 'utf8mb4',
-         collation: str = 'utf8mb4_0900_ai_ci',
+         collation: str = 'utf8mb4_0900_ai_ci',  # utf8mb4_0900_ai_ci is case-insensitive; utf8mb4_0900_as_cs / utf8mb4_bin are case-sensitive
          logging_mode: str = 'console',  # 'both' (console + file), 'console' (console only), 'file' (file only), 'none' (disabled)
          log_level: str = 'INFO',  # default log level
          log_file: str = 'mysql_upload.log',  # log file path
@@ -1127,7 +1143,7 @@ class MySQLUploader:
          backup_count: int = 5,  # number of rotated log files to keep
          max_retries: int = 10,
          retry_interval: int = 10,
-         pool_size: int = 10,
+         pool_size: int = 5,
          connect_timeout: int = 10,
          read_timeout: int = 30,
          write_timeout: int = 30,
@@ -1168,7 +1184,7 @@ class MySQLUploader:
          self.read_timeout = read_timeout
          self.write_timeout = write_timeout
          self.ssl = ssl
-         self._prepared_statements = {}
+         self._prepared_statements = StatementCache(maxsize=100)
          self._max_cached_statements = 100
          self.enable_metrics = enable_metrics
          self.metrics = {
@@ -1184,6 +1200,11 @@ class MySQLUploader:
              'memory_usage': [],
              'cpu_usage': []
          }
+         self._last_metrics_time = 0
+         self._metrics_cache = {}  # cache of the most recent system metrics
+         self.metrics_interval = 30  # minimum seconds between metrics samples
+         self._table_metadata_cache = {}  # table metadata cache
+         self.metadata_cache_ttl = 300  # metadata cache TTL in seconds

          # initialize the logging system
          self._init_logging(logging_mode, log_level, log_file, max_log_size, backup_count)
@@ -1224,6 +1245,10 @@ class MySQLUploader:
              if record.exc_info:
                  log_data['exception'] = self.formatException(record.exc_info)

+             # redact sensitive information
+             if hasattr(record, 'password'):
+                 log_data['message'] = log_data['message'].replace(self.password, '***')
+
              return json.dumps(log_data, ensure_ascii=False)

          # create the logger
@@ -1259,6 +1284,11 @@ class MySQLUploader:
          if not self.enable_metrics:
              return

+         # frequently updated counters use a plain accumulator instead of a list
+         if metric_name in ('total_uploads', 'successful_uploads', 'failed_uploads'):
+             self.metrics[metric_name] = self.metrics.get(metric_name, 0) + value
+             return
+
          if metric_name not in self.metrics:
              self.metrics[metric_name] = []

@@ -1309,9 +1339,20 @@ class MySQLUploader:
          if not self.logger:
              return

-         # record system metrics
-         metrics = self._get_system_metrics()
-         log_extra = {'metrics': metrics}
+         if len(message) > 500:
+             message = message[:500] + '...'
+
+         now = time.time()
+         if now - self._last_metrics_time > self.metrics_interval:
+             # refresh the cached system metrics
+             self._metrics_cache = self._get_system_metrics()
+             log_extra = {'metrics': self._metrics_cache}
+             self._last_metrics_time = now
+         else:
+             # reuse the cached metrics
+             log_extra = {'metrics': self._metrics_cache}
+
          if extra:
              log_extra.update(extra)

@@ -1319,6 +1360,9 @@ class MySQLUploader:

      def _create_connection_pool(self) -> PooledDB:
          """Create the database connection pool"""
+         if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
+             return self.pool
+
          start_time = time.time()
          self.pool = None

@@ -1418,7 +1462,7 @@ class MySQLUploader:
                  if attempt < self.max_retries - 1:
                      wait_time = self.retry_interval * (attempt + 1)
                      error_details['wait_time'] = wait_time
-                     self._log_with_metrics('warning', "Database operation failed; retrying", error_details)
+                     self._log_with_metrics('warning', f"Database operation failed; retrying {error_details}")
                      time.sleep(wait_time)

                      # try to reconnect
@@ -1432,7 +1476,7 @@ class MySQLUploader:
                  else:
                      elapsed = time.time() - start_time
                      error_details['time_elapsed'] = elapsed
-                     self._log_with_metrics('error', "Operation failed permanently", error_details)
+                     self._log_with_metrics('error', f"Operation failed permanently {error_details}")

              except pymysql.IntegrityError as e:
                  elapsed = time.time() - start_time
@@ -1469,7 +1513,7 @@ class MySQLUploader:
              self._log_with_metrics('debug', "Acquired a database connection")
              return conn
          except Exception as e:
-             self._log_with_metrics("error", str(e))
+             self._log_with_metrics("error", f'{e}')
              raise ConnectionError(f"Failed to connect to the database: {str(e)}")

      def _check_database_exists(self, db_name: str) -> bool:
@@ -1515,12 +1559,14 @@ class MySQLUploader:
          :raises ValueError: if the date format or partition scheme is invalid
          """
          try:
-             date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
+             # date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
+             date_obj = self._validate_datetime(date_value, True)
          except ValueError:
              try:
-                 date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
+                 # date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
+                 date_obj = self._validate_datetime(date_value, True)
              except ValueError:
-                 error_msg = f"Invalid date format: {date_value}"
+                 error_msg = f"Invalid date format (1): {date_value}"
                  self._log_with_metrics('error', error_msg)
                  raise ValueError(error_msg)

@@ -1567,24 +1613,33 @@ class MySQLUploader:

      def _check_table_exists(self, db_name: str, table_name: str) -> bool:
          """Check whether a table exists"""
+         cache_key = f"{db_name}.{table_name}"
+         if cache_key in self._table_metadata_cache:
+             cached_time, result = self._table_metadata_cache[cache_key]
+             if time.time() - cached_time < self.metadata_cache_ttl:
+                 return result
+
          db_name = self._validate_identifier(db_name)
          table_name = self._validate_identifier(table_name)
          sql = """
-             SELECT TABLE_NAME
-             FROM INFORMATION_SCHEMA.TABLES
-             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
-         """
+         SELECT TABLE_NAME
+         FROM INFORMATION_SCHEMA.TABLES
+         WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+         """

          try:
              with self._get_connection() as conn:
                  with conn.cursor() as cursor:
                      cursor.execute(sql, (db_name, table_name))
-                     exists = bool(cursor.fetchone())
-                     return exists
+                     result = bool(cursor.fetchone())
          except Exception as e:
              self._log_with_metrics('error', f"Unexpected error while checking table existence: {e}")
              raise

+         # cache the lookup result
+         self._table_metadata_cache[cache_key] = (time.time(), result)
+         return result
+
      def _create_table(
          self,
          db_name: str,
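The metadata cache added above is a plain (timestamp, value) TTL scheme. A condensed sketch of the same pattern with hypothetical names:

    import time

    _cache = {}
    TTL = 300  # seconds, mirroring metadata_cache_ttl

    def cached(key, compute):
        now = time.time()
        if key in _cache:
            ts, value = _cache[key]
            if now - ts < TTL:
                return value       # fresh enough: serve from the cache
        value = compute()          # stale or missing: recompute
        _cache[key] = (now, value)
        return value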
@@ -1691,7 +1746,8 @@ class MySQLUploader:
                  conn.rollback()
                  raise

-     def _validate_datetime(self, value):
+     def _validate_datetime(self, value, date_type=False):
+         """date_type: when True, return a date object; otherwise a formatted string"""
          formats = [
              '%Y-%m-%d %H:%M:%S',
              '%Y-%m-%d',
@@ -1699,14 +1755,20 @@ class MySQLUploader:
              '%Y/%m/%d',
              '%Y%m%d',
              '%Y-%m-%dT%H:%M:%S',
-             '%Y-%m-%d %H:%M:%S.%f'
+             '%Y-%m-%d %H:%M:%S.%f',
+             '%Y/%-m/%-d',  # 2023/1/8
+             '%Y-%m-%-d',  # 2023-01-8
+             '%Y-%-m-%-d'  # 2023-1-8
          ]
          for fmt in formats:
              try:
-                 return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
+                 if date_type:
+                     return pd.to_datetime(datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d'))
+                 else:
+                     return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
              except ValueError:
                  continue
-         raise ValueError(f"Invalid date format: {value}")
+         raise ValueError(f"Invalid date format (2): {value}")

      def _validate_value(self, value: Any, column_type: str) -> Any:
          """
@@ -1724,7 +1786,9 @@ class MySQLUploader:
          column_type_lower = column_type.lower()

          if 'int' in column_type_lower:
-             return int(value) if value is not None else None
+             if isinstance(value, (str, bytes)) and not value.strip().isdigit():
+                 raise ValueError("A non-numeric string cannot be converted to an integer")
+             return int(value)
          elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
              return float(value) if value is not None else None
          elif '日期' in column_type_lower or 'time' in column_type_lower:
@@ -1819,6 +1883,58 @@ class MySQLUploader:
              check_duplicate, duplicate_columns
          )

+     def _infer_data_type(self, value: Any) -> str:
+         """
+         Infer a suitable MySQL data type from a sample value
+
+         :param value: the value to inspect
+         :return: a MySQL data type string
+         """
+         if value is None:
+             return 'VARCHAR(255)'  # default string type
+
+         if isinstance(value, bool):
+             return 'TINYINT(1)'
+         elif isinstance(value, int):
+             # if -128 <= value <= 127:
+             #     return 'TINYINT'
+             # elif -32768 <= value <= 32767:
+             #     return 'SMALLINT'
+             # elif -8388608 <= value <= 8388607:
+             #     return 'MEDIUMINT'
+             if -2147483648 <= value <= 2147483647:
+                 return 'INT'
+             else:
+                 return 'BIGINT'
+         elif isinstance(value, float):
+             return 'DECIMAL(10,2)'
+         elif isinstance(value, (datetime.datetime, pd.Timestamp)):
+             return 'DATETIME'
+         elif isinstance(value, datetime.date):
+             return 'DATE'
+         elif isinstance(value, (list, dict)):
+             return 'JSON'
+         elif isinstance(value, str):
+             # try to detect datetime strings
+             try:
+                 self._validate_datetime(value)
+                 return 'DATETIME'
+             except ValueError:
+                 pass
+
+             # pick a string type based on length
+             length = len(value)
+             if length <= 255:
+                 return 'VARCHAR(255)'
+             elif length <= 65535:
+                 return 'TEXT'
+             elif length <= 16777215:
+                 return 'MEDIUMTEXT'
+             else:
+                 return 'LONGTEXT'
+         else:
+             return 'VARCHAR(255)'
+
      def _prepare_data(
          self,
          data: Union[Dict, List[Dict], pd.DataFrame],
@@ -1831,27 +1947,57 @@ class MySQLUploader:
          :param data: the input data
          :param set_typ: mapping of column names to data types {column: type}
          :param allow_null: whether null values are allowed
-         :return: the prepared list of rows
+         :return: the prepared rows and their resolved column types
          :raises ValueError: if data validation fails
          """
          # normalize the input into a list of dicts
          if isinstance(data, pd.DataFrame):
              try:
+                 # lower-case the column names
+                 data.columns = [col.lower() for col in data.columns]
                  data = data.replace({pd.NA: None}).to_dict('records')
              except Exception as e:
-                 self._log_with_metrics("error", f"Error converting to dict: {e}")
-                 raise ValueError(f"Error converting to dict: {e}")
+                 self._log_with_metrics("error", f"Error converting data to dict: {e}")
+                 raise ValueError(f"Error converting data to dict: {e}")
          elif isinstance(data, dict):
-             data = [data]
-         elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
-             error_msg = "Data must be a dict, list of dicts, or DataFrame"
+             data = [{k.lower(): v for k, v in data.items()}]
+         elif isinstance(data, list) and all(isinstance(item, dict) for item in data):
+             # lower-case the keys of every dict in the list
+             data = [{k.lower(): v for k, v in item.items()} for item in data]
+         else:
+             error_msg = "data must be a dict, a list of dicts, or a DataFrame"
              self._log_with_metrics('error', error_msg)
              raise ValueError(error_msg)

+         # lower-case the keys of set_typ
+         set_typ = {k.lower(): v for k, v in set_typ.items()}
+
+         # collect the column names actually present in the data
+         data_columns = set()
+         if data:
+             data_columns = set(data[0].keys())
+
+         # keep only the set_typ entries for columns present in the data
+         filtered_set_typ = {}
+         for col in data_columns:
+             if col in set_typ:
+                 filtered_set_typ[col] = set_typ[col]
+             else:
+                 # column missing from set_typ: try to infer its type
+                 sample_values = [row[col] for row in data if col in row and row[col] is not None][:10]
+                 if sample_values:
+                     inferred_type = self._infer_data_type(sample_values[0])
+                     filtered_set_typ[col] = inferred_type
+                     self._log_with_metrics('debug', f"Inferred data type for column '{col}': {inferred_type}")
+                 else:
+                     # no sample values; fall back to the default type
+                     filtered_set_typ[col] = 'VARCHAR(255)'
+                     self._log_with_metrics('debug', f"Using default data type VARCHAR(255) for column '{col}'")
+
          prepared_data = []
          for row_idx, row in enumerate(data, 1):
              prepared_row = {}
-             for col_name, col_type in set_typ.items():
+             for col_name in filtered_set_typ:
                  # skip the id column; callers may not supply their own id
                  if col_name.lower() == 'id':
                      continue
@@ -1864,7 +2010,7 @@ class MySQLUploader:
                      prepared_row[col_name] = None
                  else:
                      try:
-                         prepared_row[col_name] = self._validate_value(row[col_name], col_type)
+                         prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name])
                      except ValueError as e:
                          error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
                          self._log_with_metrics('error', error_msg)
@@ -1872,7 +2018,7 @@ class MySQLUploader:
              prepared_data.append(prepared_row)

          self._log_with_metrics('debug', f"Prepared {len(prepared_data)} rows")
-         return prepared_data
+         return prepared_data, filtered_set_typ

      def upload_data(
          self,
@@ -1913,7 +2059,7 @@ class MySQLUploader:
          try:
              # validate arguments
              if not set_typ:
-                 error_msg = "Column definitions are required"
+                 error_msg = "Column data types are missing"
                  self._log_with_metrics('error', error_msg)
                  raise ValueError(error_msg)

@@ -1923,7 +2069,7 @@ class MySQLUploader:
                  raise ValueError(error_msg)

              # prepare the data
-             prepared_data = self._prepare_data(data, set_typ, allow_null)
+             prepared_data, set_typ = self._prepare_data(data, set_typ, allow_null)

              # check that the database exists
              if not self._check_database_exists(db_name):
@@ -2019,7 +2165,17 @@ class MySQLUploader:
          batch_id: Optional[str] = None
      ):
          """
-         Insert data into the table, with enhanced logging and performance monitoring
+         Insert data into the table
+
+         Parameters:
+             db_name: database name
+             table_name: table name
+             data: list of rows to insert
+             set_typ: mapping of column names to data types {column: type}
+             check_duplicate: whether to check for duplicates
+             duplicate_columns: columns used for the duplicate check (all columns when empty)
+             batch_size: bulk-insert batch size
+             batch_id: batch ID for log tracing
          """
          if not data:
              return
@@ -2033,27 +2189,40 @@ class MySQLUploader:
          if check_duplicate:
              if not duplicate_columns:
                  duplicate_columns = all_columns
+             else:
+                 duplicate_columns = [col for col in duplicate_columns if col != 'id']
+
+             conditions = []
+             for col in duplicate_columns:
+                 col_type = set_typ.get(col, '').lower()
+
+                 # for DECIMAL columns, ROUND both sides so the precision matches
+                 if col_type.startswith('decimal'):
+                     # extract the scale, e.g. 2 from DECIMAL(10,2)
+                     scale_match = re.search(r'decimal\(\d+,(\d+)\)', col_type)
+                     scale = int(scale_match.group(1)) if scale_match else 2
+                     conditions.append(f"ROUND(`{self._validate_identifier(col)}`, {scale}) = ROUND(%s, {scale})")
+                 else:
+                     conditions.append(f"`{self._validate_identifier(col)}` = %s")

-             safe_dup_columns = [self._validate_identifier(col) for col in duplicate_columns]
-             conditions = [f"`{col}` = %s" for col in safe_dup_columns]
              where_clause = " AND ".join(conditions)

              sql = f"""
-                 INSERT INTO `{db_name}`.`{table_name}`
-                 (`{'`,`'.join(safe_columns)}`)
-                 SELECT {placeholders}
-                 FROM DUAL
-                 WHERE NOT EXISTS (
-                     SELECT 1 FROM `{db_name}`.`{table_name}`
-                     WHERE {where_clause}
-                 )
-             """
+             INSERT INTO `{db_name}`.`{table_name}`
+             (`{'`,`'.join(safe_columns)}`)
+             SELECT {placeholders}
+             FROM DUAL
+             WHERE NOT EXISTS (
+                 SELECT 1 FROM `{db_name}`.`{table_name}`
+                 WHERE {where_clause}
+             )
+             """
          else:
              sql = f"""
-                 INSERT INTO `{db_name}`.`{table_name}`
-                 (`{'`,`'.join(safe_columns)}`)
-                 VALUES ({placeholders})
-             """
+             INSERT INTO `{db_name}`.`{table_name}`
+             (`{'`,`'.join(safe_columns)}`)
+             VALUES ({placeholders})
+             """

          total_inserted = 0
          total_skipped = 0
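A condensed, runnable sketch of the scale extraction the DECIMAL branch performs (hypothetical column name `salary`):

    import re

    col_type = 'decimal(10,2)'
    scale_match = re.search(r'decimal\(\d+,(\d+)\)', col_type)
    scale = int(scale_match.group(1)) if scale_match else 2
    print(f"ROUND(`salary`, {scale}) = ROUND(%s, {scale})")
    # -> ROUND(`salary`, 2) = ROUND(%s, 2)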
@@ -2069,7 +2238,9 @@ class MySQLUploader:

              for row in batch:
                  try:
+                     # build the parameter list
                      row_values = [row.get(col) for col in all_columns]
+                     # when checking duplicates, append the duplicate-column values
                      if check_duplicate:
                          row_values += [row.get(col) for col in duplicate_columns]

@@ -2086,11 +2257,13 @@ class MySQLUploader:
                          'batch_id': batch_id,
                          'database': db_name,
                          'table': table_name,
-                         'row_data': row,
                          'error_type': type(e).__name__,
-                         'error_message': str(e)
+                         'error_message': str(e),
+                         'column_types': set_typ,
+                         'duplicate_check': check_duplicate,
+                         'duplicate_columns': duplicate_columns
                      }
-                     self._log_with_metrics('error', "Row insert failed", error_details)
+                     self._log_with_metrics('error', f"Row insert failed: {error_details}")
                      continue  # skip this row and move on to the next

              # update the statistics
@@ -2115,7 +2288,7 @@ class MySQLUploader:
                  'time_elapsed': batch_elapsed,
                  'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
              }
-             self._log_with_metrics('debug', "Batch complete", batch_info)
+             self._log_with_metrics('debug', f"Batch complete {batch_info}")

          # update the global metrics
          self.metrics['failed_rows'] += total_failed
@@ -2178,37 +2351,681 @@ class MySQLUploader:
          if hasattr(self, 'logger') and self.logger and self.enable_metrics:
              self._log_with_metrics('debug', "Final performance metrics", self.get_metrics())

- def __main__():
-     pass
+     def _check_pool_health(self):
+         """Periodically check the health of the connection pool"""
+         try:
+             conn = self.pool.connection()
+             conn.ping(reconnect=True)
+             conn.close()
+             return True
+         except Exception as e:
+             self._log_with_metrics('warning', "Connection pool health check failed", {
+                 'error': str(e)
+             })
+             return False
+
+     def retry_on_failure(max_retries=3, delay=1):
+         def decorator(func):
+             @wraps(func)
+             def wrapper(*args, **kwargs):
+                 last_exception = None
+                 for attempt in range(max_retries):
+                     try:
+                         return func(*args, **kwargs)
+                     except (pymysql.OperationalError, pymysql.InterfaceError) as e:
+                         last_exception = e
+                         if attempt < max_retries - 1:
+                             time.sleep(delay * (attempt + 1))
+                             continue
+                         raise MySQLUploaderError(f"Operation failed after {max_retries} retries") from e
+                     except Exception as e:
+                         raise MySQLUploaderError(f"Operation failed: {str(e)}") from e
+                 raise last_exception if last_exception else MySQLUploaderError("Unknown error")

+             return wrapper

- if __name__ == '__main__':
-     pass
+         return decorator
+
+
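A usage sketch for the decorator (assumptions: retry_on_failure is reachable from module scope, and MySQLUploaderError, which the decorator references but this diff never defines, exists elsewhere in the module as an Exception subclass):

    import pymysql

    @retry_on_failure(max_retries=3, delay=1)
    def flaky_op():
        # transient connection errors are retried with a growing delay
        raise pymysql.OperationalError(2006, 'MySQL server has gone away')

    try:
        flaky_op()
    except Exception as e:
        print(type(e).__name__)  # MySQLUploaderError, after three attempts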
+ class MySQLDeduplicator:
+     """
+     MySQL data deduplication
+
+     Features:
+     1. Automatically detect and delete duplicate rows in MySQL databases
+     2. Scan the whole server or process only the specified tables
+     3. Safe to use with multiple threads/processes
+     4. Thorough error handling and logging
+
+     Usage example:
+     deduplicator = MySQLDeduplicator(
+         username='root',
+         password='password',
+         host='localhost',
+         port=3306
+     )
+
+     # dedupe every database
+     deduplicator.deduplicate_all()
+
+     # dedupe one database (multi-threaded)
+     deduplicator.deduplicate_database('my_db', parallel=True)
+
+     # dedupe one table (using selected columns)
+     deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+
+     # close the connections
+     deduplicator.close()
+     """
+
+     def __init__(
+         self,
+         username: str,
+         password: str,
+         host: str = 'localhost',
+         port: int = 3306,
+         charset: str = 'utf8mb4',
+         max_workers: int = 1,
+         batch_size: int = 1000,
+         skip_system_dbs: bool = True,
+         logging_mode: str = 'console',
+         log_level: str = 'INFO',
+         log_file: str = 'mysql_deduplicate.log',
+         max_retries: int = 3,
+         retry_interval: int = 5,
+         pool_size: int = 5
+     ):
+         """
+         Initialize the deduplicator
+
+         :param username: database username
+         :param password: database password
+         :param host: database host, defaults to localhost
+         :param port: database port, defaults to 3306
+         :param charset: character set, defaults to utf8mb4
+         :param max_workers: maximum worker threads, defaults to 1 (single-threaded)
+         :param batch_size: batch size, defaults to 1000
+         :param skip_system_dbs: whether to skip system databases, defaults to True
+         :param logging_mode: logging mode ('console', 'file', 'both', 'none')
+         :param log_level: log level ('DEBUG', 'INFO', 'WARNING', 'ERROR')
+         :param log_file: log file path
+         :param max_retries: maximum number of retries
+         :param retry_interval: retry interval in seconds
+         :param pool_size: connection pool size
+         """
+         # initialize the connection pool
+         self.pool = PooledDB(
+             creator=pymysql,
+             host=host,
+             port=port,
+             user=username,
+             password=password,
+             charset=charset,
+             maxconnections=pool_size,
+             cursorclass=pymysql.cursors.DictCursor
+         )
+
+         # configuration
+         self.max_workers = max(1, min(max_workers, 20))  # cap the thread count
+         self.batch_size = batch_size
+         self.skip_system_dbs = skip_system_dbs
+         self.max_retries = max_retries
+         self.retry_interval = retry_interval
+
+         # thread-safety controls
+         self._lock = threading.Lock()
+         self._processing_tables = set()  # tables currently being processed
+
+         # initialize logging
+         self._init_logging(logging_mode, log_level, log_file)
+
+         # system databases
+         self.SYSTEM_DATABASES = {
+             'information_schema', 'mysql',
+             'performance_schema', 'sys'
+         }
+
+     def _init_logging(
+         self,
+         logging_mode: str,
+         log_level: str,
+         log_file: str
+     ):
+         """Initialize the logging configuration"""
+         self.logger = logging.getLogger('mysql_deduplicator')
+         self.logger.setLevel(log_level.upper())
+
+         # avoid adding duplicate handlers
+         if self.logger.handlers:
+             for handler in self.logger.handlers[:]:
+                 self.logger.removeHandler(handler)
+
+         formatter = logging.Formatter(
+             '%(asctime)s - %(levelname)s - %(message)s',
+             datefmt='%Y-%m-%d %H:%M:%S'
+         )
+
+         mode = logging_mode.lower()
+         if mode in ('both', 'console'):
+             console_handler = logging.StreamHandler()
+             console_handler.setFormatter(formatter)
+             self.logger.addHandler(console_handler)
+
+         if mode in ('both', 'file'):
+             file_handler = logging.FileHandler(
+                 filename=log_file,
+                 encoding='utf-8'
+             )
+             file_handler.setFormatter(formatter)
+             self.logger.addHandler(file_handler)
+
+     def _log(self, level: str, message: str, extra: Optional[Dict] = None):
+         """Unified logging helper"""
+         if not hasattr(self.logger, level.lower()):
+             return
+
+         # truncate overly long messages
+         if len(message) > 500:
+             message = message[:500] + '...'
+
+         log_method = getattr(self.logger, level.lower())
+         log_method(message, extra=extra)
+
+     def _get_connection(self):
+         """Fetch a connection from the pool"""
+         try:
+             conn = self.pool.connection()
+             self._log('debug', "Acquired a database connection")
+             return conn
+         except Exception as e:
+             self._log('error', f"Failed to get a database connection: {str(e)}")
+             raise ConnectionError(f"Failed to connect to the database: {str(e)}")
+
+     @staticmethod
+     def _retry_on_failure(func):
+         """Retry decorator"""
+
+         @wraps(func)
+         def wrapper(self, *args, **kwargs):
+             last_exception = None
+             for attempt in range(self.max_retries + 1):
+                 try:
+                     return func(self, *args, **kwargs)
+                 except (pymysql.OperationalError, pymysql.InterfaceError) as e:
+                     last_exception = e
+                     if attempt < self.max_retries:
+                         wait_time = self.retry_interval * (attempt + 1)
+                         self._log('warning',
+                                   f"Database operation failed; retrying (attempt {attempt + 1}/{self.max_retries})",
+                                   {'error': str(e), 'wait_time': wait_time})
+                         time.sleep(wait_time)
+                         continue
+                 except Exception as e:
+                     last_exception = e
+                     self._log('error',
+                               f"Operation failed: {str(e)}",
+                               {'error_type': type(e).__name__})
+                     break
+
+             if last_exception:
+                 raise last_exception
+             raise Exception("Unknown error")
+
+         return wrapper
+
+     @_retry_on_failure
+     def _get_databases(self) -> List[str]:
+         """Return all non-system databases"""
+         sql = "SHOW DATABASES"
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql)
+                 all_dbs = [row['Database'] for row in cursor.fetchall()]
+
+                 if self.skip_system_dbs:
+                     return [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES]
+                 return all_dbs
+
+     @_retry_on_failure
+     def _get_tables(self, database: str) -> List[str]:
+         """Return all tables in the given database"""
+         sql = "SHOW TABLES"
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(f"USE `{database}`")
+                 cursor.execute(sql)
+                 return [row[f'Tables_in_{database}'] for row in cursor.fetchall()]
+
+     @_retry_on_failure
+     def _get_table_columns(self, database: str, table: str) -> List[str]:
+         """Return the table's column names (excluding the id column)"""
+         sql = """
+         SELECT COLUMN_NAME
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+         ORDER BY ORDINAL_POSITION
+         """
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql, (database, table))
+                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
+                         if row['COLUMN_NAME'].lower() != 'id']
+
+     def _acquire_table_lock(self, database: str, table: str) -> bool:
+         """Acquire the per-table lock so two threads never process the same table"""
+         key = f"{database}.{table}"
+
+         with self._lock:
+             if key in self._processing_tables:
+                 self._log('debug', f"Table {key} is being processed by another thread; skipping")
+                 return False
+             self._processing_tables.add(key)
+             return True
+
+     def _release_table_lock(self, database: str, table: str):
+         """Release the per-table lock"""
+         key = f"{database}.{table}"
+
+         with self._lock:
+             if key in self._processing_tables:
+                 self._processing_tables.remove(key)
+
+     def _deduplicate_table(
+         self,
+         database: str,
+         table: str,
+         columns: Optional[List[str]] = None,
+         dry_run: bool = False
+     ) -> Tuple[int, int]:
+         """
+         Deduplicate a single table
+
+         :param database: database name
+         :param table: table name
+         :param columns: columns used for deduplication (all columns when None)
+         :param dry_run: simulate only (count duplicates without deleting)
+         :return: (number of duplicate groups, number of rows deleted)
+         """
+         if not self._acquire_table_lock(database, table):
+             return (0, 0)
+
+         try:
+             self._log('info', f"Processing table: {database}.{table}")
+
+             # fetch the real column names
+             all_columns = self._get_table_columns(database, table)
+             if not all_columns:
+                 self._log('warning', f"Table {database}.{table} has no usable columns (possibly only id); skipping")
+                 return (0, 0)
+
+             # use the requested columns, or all of them
+             use_columns = columns or all_columns
+             invalid_columns = set(use_columns) - set(all_columns)
+
+             if invalid_columns:
+                 self._log('warning',
+                           f"Table {database}.{table} does not contain these columns: {invalid_columns}; using the valid ones",
+                           {'invalid_columns': invalid_columns}
+                           )
+                 use_columns = [col for col in use_columns if col in all_columns]
+
+             if not use_columns:
+                 self._log('error', f"Table {database}.{table} has no valid deduplication columns")
+                 return (0, 0)
+
+             # build the deduplication SQL
+             column_list = ', '.join([f'`{col}`' for col in use_columns])
+             temp_table = f"temp_{table}_{int(time.time())}"
+
+             # use a temporary table so the main table is not locked for long
+             create_temp_sql = f"""
+                 CREATE TABLE `{database}`.`{temp_table}` AS
+                 SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+                 FROM `{database}`.`{table}`
+                 GROUP BY {column_list}
+                 HAVING COUNT(*) > 1
+             """
+
+             delete_dup_sql = f"""
+                 DELETE FROM `{database}`.`{table}`
+                 WHERE `id` NOT IN (
+                     SELECT `min_id` FROM `{database}`.`{temp_table}`
+                 ) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
+             """
+
+             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+
+             with self._get_connection() as conn:
+                 with conn.cursor() as cursor:
+                     # create the temp table that tallies duplicate groups
+                     cursor.execute(create_temp_sql)
+                     cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
+                     dup_count = cursor.fetchone()['cnt']
+
+                     if dup_count == 0:
+                         self._log('info', f"Table {database}.{table} has no duplicates")
+                         cursor.execute(drop_temp_sql)
+                         conn.commit()
+                         return (0, 0)
+
+                     self._log('info',
+                               f"Table {database}.{table}: found {dup_count} groups of duplicates",
+                               {'columns': use_columns}
+                               )
+
+                     if not dry_run:
+                         # perform the actual delete
+                         cursor.execute(delete_dup_sql)
+                         affected_rows = cursor.rowcount
+                         conn.commit()
+                         self._log('info',
+                                   f"Table {database}.{table}: deleted {affected_rows} duplicate rows",
+                                   {'columns': use_columns}
+                                   )
+                     else:
+                         affected_rows = 0
+                         self._log('info',
+                                   f"[dry run] Table {database}.{table}: would delete {dup_count} groups of duplicates",
+                                   {'columns': use_columns}
+                                   )
+
+                     # drop the temp table
+                     cursor.execute(drop_temp_sql)
+                     conn.commit()
+
+                     return (dup_count, affected_rows)
+
+         except Exception as e:
+             self._log('error',
+                       f"Error while processing table {database}.{table}: {str(e)}",
+                       {'error_type': type(e).__name__}
+                       )
+             return (0, 0)
+         finally:
+             self._release_table_lock(database, table)
+
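A caveat on the DELETE above: the temp table only contains groups with COUNT(*) > 1, so `id NOT IN (SELECT min_id ...)` also matches rows that were never duplicated, and the trailing IS NOT NULL guard does not restrict the delete to duplicate groups. A join-based variant that only touches rows inside duplicate groups could look like this (a sketch using the same names as the method above; <=> is MySQL's NULL-safe equality):

    join_on = ' AND '.join(f't.`{col}` <=> d.`{col}`' for col in use_columns)
    delete_dup_sql = (
        f"DELETE t FROM `{database}`.`{table}` t "
        f"JOIN `{database}`.`{temp_table}` d ON {join_on} "
        f"WHERE t.`id` > d.`min_id`"  # keep the lowest id in each duplicate group
    )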
+     def deduplicate_table(
+         self,
+         database: str,
+         table: str,
+         columns: Optional[List[str]] = None,
+         dry_run: bool = False
+     ) -> Tuple[int, int]:
+         """
+         Deduplicate the given table
+
+         :param database: database name
+         :param table: table name
+         :param columns: columns used for deduplication (all columns when None)
+         :param dry_run: simulate only (count duplicates without deleting)
+         :return: (number of duplicate groups, number of rows deleted)
+         """
+         try:
+             # make sure the table exists
+             if not self._check_table_exists(database, table):
+                 self._log('warning', f"Table {database}.{table} does not exist; skipping")
+                 return (0, 0)
+
+             return self._deduplicate_table(database, table, columns, dry_run)
+         except Exception as e:
+             self._log('error',
+                       f"Unhandled error while processing table {database}.{table}: {str(e)}",
+                       {'error_type': type(e).__name__}
+                       )
+             return (0, 0)
+
+     def deduplicate_database(
+         self,
+         database: str,
+         tables: Optional[List[str]] = None,
+         columns_map: Optional[Dict[str, List[str]]] = None,
+         dry_run: bool = False,
+         parallel: bool = False
+     ) -> Dict[str, Tuple[int, int]]:
+         """
+         Deduplicate every table in the given database
+
+         :param database: database name
+         :param tables: tables to process (all tables when None)
+         :param columns_map: deduplication columns per table {table: [column]}
+         :param dry_run: simulate only
+         :param parallel: process in parallel
+         :return: dict {table: (duplicate groups, rows deleted)}
+         """
+         results = {}
+
+         try:
+             # make sure the database exists
+             if not self._check_database_exists(database):
+                 self._log('warning', f"Database {database} does not exist; skipping")
+                 return results
+
+             # pick the tables to process
+             target_tables = tables or self._get_tables(database)
+             if not target_tables:
+                 self._log('info', f"Database {database} has no tables; skipping")
+                 return results
+
+             self._log('info',
+                       f"Processing {len(target_tables)} tables in database {database}",
+                       {'tables': target_tables}
+                       )
+
+             if parallel and self.max_workers > 1:
+                 # parallel processing
+                 with concurrent.futures.ThreadPoolExecutor(
+                         max_workers=self.max_workers
+                 ) as executor:
+                     futures = {}
+                     for table in target_tables:
+                         columns = columns_map.get(table) if columns_map else None
+                         futures[executor.submit(
+                             self.deduplicate_table,
+                             database, table, columns, dry_run
+                         )] = table
+
+                     for future in concurrent.futures.as_completed(futures):
+                         table = futures[future]
+                         try:
+                             dup_count, affected_rows = future.result()
+                             results[table] = (dup_count, affected_rows)
+                         except Exception as e:
+                             self._log('error',
+                                       f"Error while processing table {database}.{table}: {str(e)}",
+                                       {'error_type': type(e).__name__}
+                                       )
+                             results[table] = (0, 0)
+             else:
+                 # serial processing
+                 for table in target_tables:
+                     columns = columns_map.get(table) if columns_map else None
+                     dup_count, affected_rows = self.deduplicate_table(
+                         database, table, columns, dry_run
+                     )
+                     results[table] = (dup_count, affected_rows)

-     # initialize the uploader
+             # aggregate the results
+             total_dup = sum(r[0] for r in results.values())
+             total_del = sum(r[1] for r in results.values())
+
+             self._log('info',
+                       f"Database {database} processed: found {total_dup} groups of duplicates, deleted {total_del} rows",
+                       {'results': results}
+                       )
+
+             return results
+
+         except Exception as e:
+             self._log('error',
+                       f"Unhandled error while processing database {database}: {str(e)}",
+                       {'error_type': type(e).__name__}
+                       )
+             return results
+
+     def deduplicate_all(
+         self,
+         databases: Optional[List[str]] = None,
+         tables_map: Optional[Dict[str, List[str]]] = None,
+         columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
+         dry_run: bool = False,
+         parallel: bool = False
+     ) -> Dict[str, Dict[str, Tuple[int, int]]]:
+         """
+         Deduplicate every database
+
+         :param databases: databases to process (all non-system databases when None)
+         :param tables_map: tables to process per database {database: [table]}
+         :param columns_map: deduplication columns per table {database: {table: [column]}}
+         :param dry_run: simulate only
+         :param parallel: process in parallel
+         :return: nested dict {database: {table: (duplicate groups, rows deleted)}}
+         """
+         all_results = defaultdict(dict)
+
+         try:
+             # pick the databases to process
+             target_dbs = databases or self._get_databases()
+             if not target_dbs:
+                 self._log('warning', "No databases to process")
+                 return all_results
+
+             self._log('info',
+                       f"Processing {len(target_dbs)} databases",
+                       {'databases': target_dbs}
+                       )
+
+             if parallel and self.max_workers > 1:
+                 # process the databases in parallel
+                 with concurrent.futures.ThreadPoolExecutor(
+                         max_workers=self.max_workers
+                 ) as executor:
+                     futures = {}
+                     for db in target_dbs:
+                         tables = tables_map.get(db) if tables_map else None
+                         db_columns_map = columns_map.get(db) if columns_map else None
+                         futures[executor.submit(
+                             self.deduplicate_database,
+                             db, tables, db_columns_map, dry_run, False
+                         )] = db
+
+                     for future in concurrent.futures.as_completed(futures):
+                         db = futures[future]
+                         try:
+                             db_results = future.result()
+                             all_results[db] = db_results
+                         except Exception as e:
+                             self._log('error',
+                                       f"Error while processing database {db}: {str(e)}",
+                                       {'error_type': type(e).__name__}
+                                       )
+                             all_results[db] = {}
+             else:
+                 # process the databases serially
+                 for db in target_dbs:
+                     tables = tables_map.get(db) if tables_map else None
+                     db_columns_map = columns_map.get(db) if columns_map else None
+                     db_results = self.deduplicate_database(
+                         db, tables, db_columns_map, dry_run, parallel
+                     )
+                     all_results[db] = db_results
+
+             # aggregate the overall results
+             total_dup = sum(
+                 r[0] for db in all_results.values()
+                 for r in db.values()
+             )
+             total_del = sum(
+                 r[1] for db in all_results.values()
+                 for r in db.values()
+             )
+
+             self._log('info',
+                       f"All databases processed: found {total_dup} groups of duplicates, deleted {total_del} rows",
+                       {'total_results': all_results}
+                       )
+
+             return all_results
+
+         except Exception as e:
+             self._log('error',
+                       f"Unhandled top-level error: {str(e)}",
+                       {'error_type': type(e).__name__}
+                       )
+             return all_results
+
+     @_retry_on_failure
+     def _check_database_exists(self, database: str) -> bool:
+         """Check whether the database exists"""
+         sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql, (database,))
+                 return bool(cursor.fetchone())
+
+     @_retry_on_failure
+     def _check_table_exists(self, database: str, table: str) -> bool:
+         """Check whether the table exists"""
+         sql = """
+         SELECT TABLE_NAME
+         FROM INFORMATION_SCHEMA.TABLES
+         WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
+         """
+
+         with self._get_connection() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute(sql, (database, table))
+                 return bool(cursor.fetchone())
+
+     def close(self):
+         """Close the connection pool"""
+         try:
+             if hasattr(self, 'pool') and self.pool:
+                 self.pool.close()
+                 self._log('info', "Database connection pool closed")
+         except Exception as e:
+             self._log('error',
+                       f"Error while closing the connection pool: {str(e)}",
+                       {'error_type': type(e).__name__}
+                       )
+         finally:
+             self.pool = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+
+ def main():
      uploader = MySQLUploader(
          username='root',
          password='1',
          host='localhost',
          port=3306,
-         logging_mode='both',
+         logging_mode='console',
          log_level='info'
      )

      # define the columns and data types
      set_typ = {
-         'id': 'INT',
          'name': 'VARCHAR(255)',
          'age': 'INT',
          'salary': 'DECIMAL(10,2)',
-         '日期': 'DATE'
+         '日期': 'DATE',
+         'shop': None,
      }

      # prepare the data
      data = [
-         {'日期': '2023-01-15', 'name': 'Alice', 'age': 35, 'salary': 100},
-         {'日期': '2023-01-15', 'name': 'Alice', 'age': 30, 'salary': 0.0},
-         {'日期': '2023-02-20', 'name': 'Bob', 'age': 25, 'salary': 45000.75}
+         {'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
+         {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': 100},
+         {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 30, 'salary': 0.0},
+         {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75}
      ]

      # upload the data
@@ -2217,15 +3034,40 @@ if __name__ == '__main__':
          table_name='测试表',
          data=data,
          set_typ=set_typ,  # columns and data types
-         primary_keys=[],  # specify primary keys
-         check_duplicate=True,  # check for duplicate data
-         duplicate_columns=['name', 'age'],  #
+         primary_keys=[],  # create a unique primary key
+         check_duplicate=False,  # check for duplicate data
+         duplicate_columns=[],  # composite key used for duplicate checks
          allow_null=False,  # allow inserting nulls
          partition_by='year',  # partition the table by year
-         partition_date_column = '日期',  # date column used for partitioning, defaults to '日期'
-         auto_create = True,  # auto-create the table if missing; keep the default
-         indexes = ['name'],  # columns to index
+         partition_date_column='日期',  # date column used for partitioning, defaults to '日期'
+         auto_create=True,  # auto-create the table if missing; keep the default
+         indexes=[],  # columns to index
      )

-     # close the uploader
      uploader.close()
+
+
+ def main2():
+     deduplicator = MySQLDeduplicator(
+         username='root',
+         password='1',
+         host='localhost',
+         port=3306
+     )
+
+     # # dedupe every database (single-threaded)
+     # deduplicator.deduplicate_all()
+
+     # # dedupe one database (multi-threaded)
+     # deduplicator.deduplicate_database('my_db', parallel=True)
+
+     # dedupe one table (using selected columns)
+     deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+
+     # close the connections
+     deduplicator.close()
+
+ if __name__ == '__main__':
+     pass
+
+     main2()
mdbq-3.9.2.dist-info/METADATA → mdbq-3.9.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mdbq
- Version: 3.9.2
+ Version: 3.9.4
  Home-page: https://pypi.org/project/mdbq
  Author: xigua,
  Author-email: 2587125111@qq.com
mdbq-3.9.2.dist-info/RECORD → mdbq-3.9.4.dist-info/RECORD RENAMED
@@ -1,5 +1,5 @@
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
- mdbq/__version__.py,sha256=x030kSR5wz8nf_l9kAxmL-5kk7A84GeAWO_CGB1N2Cw,17
+ mdbq/__version__.py,sha256=44Qvc6l4hjIIQuGixaBICZNZB9jeL2ztNkT4fkONEBc,17
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
  mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
@@ -8,7 +8,7 @@ mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
- mdbq/mysql/mysql.py,sha256=9e1mT12gWE8-Vld-E52EAtoAxKqQVmTlG7aGUXO17vo,99908
+ mdbq/mysql/mysql.py,sha256=ylGvSzFE2B78y77wG266tf_RaEuETnngqDKUTqjQCjs,132378
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
@@ -22,7 +22,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
  mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
- mdbq-3.9.2.dist-info/METADATA,sha256=6Swdffh2m_pfD0XyeKyJGbehkwcuhWFiJtHakbCuBDQ,363
- mdbq-3.9.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mdbq-3.9.2.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
- mdbq-3.9.2.dist-info/RECORD,,
+ mdbq-3.9.4.dist-info/METADATA,sha256=1FQB3vRRNlxontQEXd6gE-RhnHbjAPOZcnc_Xh9I4B0,363
+ mdbq-3.9.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mdbq-3.9.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+ mdbq-3.9.4.dist-info/RECORD,,