mdbq 3.9.6__py3-none-any.whl → 3.9.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/log/mylogger.py +57 -7
- mdbq/mysql/deduplicator.py +595 -0
- mdbq/mysql/mysql.py +146 -431
- mdbq/mysql/uploader.py +1151 -0
- {mdbq-3.9.6.dist-info → mdbq-3.9.7.dist-info}/METADATA +1 -1
- {mdbq-3.9.6.dist-info → mdbq-3.9.7.dist-info}/RECORD +9 -7
- {mdbq-3.9.6.dist-info → mdbq-3.9.7.dist-info}/WHEEL +0 -0
- {mdbq-3.9.6.dist-info → mdbq-3.9.7.dist-info}/top_level.txt +0 -0
mdbq/mysql/mysql.py
CHANGED
@@ -12,6 +12,7 @@ import os
 import logging
 import logging.handlers
 from mdbq.other import otk
+from mdbq.log import mylogger
 from typing import Union, List, Dict, Optional, Any, Tuple, Set
 from dbutils.pooled_db import PooledDB
 import json
@@ -27,7 +28,18 @@ warnings.filterwarnings('ignore')
 Table-creation workflow:
 Table-creation conventions:
 """
-logger =
+logger = mylogger.MyLogger(
+    name='mysql',
+    logging_mode='both',
+    log_level='info',
+    log_file='mysql.log',
+    log_format='json',
+    max_log_size=50,
+    backup_count=5,
+    enable_async=False,  # enable asynchronous logging or not
+    sample_rate=0.5,  # sample 50% of DEBUG/INFO logs
+    sensitive_fields=[],  # list of sensitive fields
+)
 
 
 def count_decimal_places(num_str):
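The hunks below replace every `self._log_with_metrics(...)` and `self._log(...)` call with this shared module-level `logger`. Note the call convention: each logging method takes a message plus an optional dict of structured context. The real `mdbq.log.mylogger.MyLogger` is not shown in this diff beyond the constructor above; the stand-in below is a hypothetical sketch of just that call convention, useful for tracing the hunks that follow.

# Hypothetical stand-in for mdbq.log.mylogger.MyLogger - illustration only.
# It mimics the call convention used throughout this diff: logger.info(msg, ctx_dict).
# The real MyLogger also implements rotation, JSON formatting, sampling,
# async dispatch and sensitive-field masking (see the constructor above).
import json
import logging


class DemoLogger:
    def __init__(self, name: str, log_level: str = 'info'):
        self._logger = logging.getLogger(name)
        self._logger.setLevel(log_level.upper())
        if not self._logger.handlers:
            self._logger.addHandler(logging.StreamHandler())

    def _log(self, level: str, message: str, extra=None):
        if extra:
            # append the structured context as a JSON suffix
            message = f"{message} {json.dumps(extra, ensure_ascii=False, default=str)}"
        getattr(self._logger, level)(message)

    def debug(self, message, extra=None): self._log('debug', message, extra)
    def info(self, message, extra=None): self._log('info', message, extra)
    def warning(self, message, extra=None): self._log('warning', message, extra)
    def error(self, message, extra=None): self._log('error', message, extra)


logger = DemoLogger('mysql')
logger.info("Connection pool created", {'pool_size': 5, 'time_elapsed': 0.02})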
@@ -1136,19 +1148,13 @@ class MySQLUploader:
             port: int = 3306,
             charset: str = 'utf8mb4',
             collation: str = 'utf8mb4_0900_ai_ci',  # utf8mb4_0900_ai_ci is case-insensitive; utf8mb4_0900_as_cs/utf8mb4_bin are case-sensitive
-            logging_mode: str = 'console',  # 'both' (console + file), 'console' (console only), 'file' (file only), 'none' (disabled)
-            log_level: str = 'INFO',  # default log level
-            log_file: str = 'mysql_upload.log',  # log file path
-            max_log_size: int = 50,  # log file size (MB)
-            backup_count: int = 5,  # number of log files to keep
             max_retries: int = 10,
             retry_interval: int = 10,
             pool_size: int = 5,
             connect_timeout: int = 10,
             read_timeout: int = 30,
             write_timeout: int = 30,
-            ssl: Optional[Dict] = None
-            enable_metrics: bool = True  # collect performance metrics or not
+            ssl: Optional[Dict] = None
     ):
         """
         :param username: database username
@@ -1157,11 +1163,7 @@ class MySQLUploader:
         :param port: database port, defaults to 3306
         :param charset: character set, defaults to utf8mb4
         :param collation: collation, defaults to utf8mb4_0900_ai_ci
-
-        :param log_level: log level, defaults to INFO
-        :param log_file: log file path
-        :param max_log_size: maximum log file size (MB), defaults to 50
-        :param backup_count: number of log backups to keep, defaults to 5
+
         :param max_retries: maximum number of retries, defaults to 10
         :param retry_interval: retry interval (seconds), defaults to 10
         :param pool_size: connection pool size, defaults to 5
@@ -1169,7 +1171,6 @@ class MySQLUploader:
         :param read_timeout: read timeout (seconds), defaults to 30
         :param write_timeout: write timeout (seconds), defaults to 30
         :param ssl: SSL configuration dict, defaults to None
-        :param enable_metrics: collect performance metrics or not, defaults to True
         """
         self.username = username
         self.password = password
@@ -1186,178 +1187,12 @@ class MySQLUploader:
         self.ssl = ssl
         self._prepared_statements = StatementCache(maxsize=100)
         self._max_cached_statements = 100
-        self.enable_metrics = enable_metrics
-        self.metrics = {
-            'total_uploads': 0,
-            'successful_uploads': 0,
-            'failed_uploads': 0,
-            'total_rows': 0,
-            'successful_rows': 0,
-            'failed_rows': 0,
-            'total_retries': 0,
-            'total_execution_time': 0.0,
-            'connection_usage': [],
-            'memory_usage': [],
-            'cpu_usage': []
-        }
-        self._last_metrics_time = 0
-        self._metrics_cache = {}  # cache of the most recent system metrics
-        self.metrics_interval = 30  # metrics sampling interval
-        self._table_metadata_cache = {}  # metadata cache
-        self.metadata_cache_ttl = 300  # metadata cache TTL
-
-        # initialize the logging system
-        self._init_logging(logging_mode, log_level, log_file, max_log_size, backup_count)
+        self._table_metadata_cache = {}
+        self.metadata_cache_ttl = 300  # 5-minute cache TTL
 
         # create the connection pool
         self.pool = self._create_connection_pool()
 
-    def _init_logging(
-            self,
-            logging_mode: str,
-            log_level: str,
-            log_file: str,
-            max_log_size: int,
-            backup_count: int
-    ):
-        """Initialize the structured logging configuration"""
-        if logging_mode.lower() == 'none':
-            self.logger = None
-            return
-
-        valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
-        level = log_level.upper() if log_level.upper() in valid_levels else 'INFO'
-
-        # create the formatter - structured JSON format
-        class StructuredFormatter(logging.Formatter):
-            def format(self, record):
-                log_data = {
-                    'time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-                    'level': record.levelname,
-                    'message': record.getMessage(),
-                    # 'logger': record.name,
-                    'module': record.module,
-                    'line': record.lineno,
-                    # 'process': record.process
-                }
-
-                # append exception info
-                if record.exc_info:
-                    log_data['exception'] = self.formatException(record.exc_info)
-
-                # filter sensitive information
-                if hasattr(record, 'password'):
-                    log_data['message'] = log_data['message'].replace(self.password, '***')
-
-                return json.dumps(log_data, ensure_ascii=False)
-
-        # create the logger
-        self.logger = logging.getLogger('upload')
-        self.logger.setLevel(level)
-
-        # avoid adding duplicate handlers
-        if self.logger.handlers:
-            for handler in self.logger.handlers[:]:
-                self.logger.removeHandler(handler)
-
-        formatter = StructuredFormatter()
-        mode = logging_mode.lower()
-
-        # attach handlers according to the mode
-        if mode in ('both', 'console'):
-            console_handler = logging.StreamHandler()
-            console_handler.setFormatter(formatter)
-            self.logger.addHandler(console_handler)
-
-        if mode in ('both', 'file'):
-            file_handler = logging.handlers.RotatingFileHandler(
-                filename=log_file,
-                maxBytes=max_log_size * 1024 * 1024,
-                backupCount=backup_count,
-                encoding='utf-8'
-            )
-            file_handler.setFormatter(formatter)
-            self.logger.addHandler(file_handler)
-
-    def _record_metrics(self, metric_name: str, value: Any = 1, is_timing: bool = False):
-        """Record a performance metric"""
-        if not self.enable_metrics:
-            return
-
-        # use a more efficient structure for frequently updated counters
-        if metric_name in ('total_uploads', 'successful_uploads', 'failed_uploads'):
-            self.metrics[metric_name] = self.metrics.get(metric_name, 0) + value
-            return
-
-        if metric_name not in self.metrics:
-            self.metrics[metric_name] = []
-
-        if is_timing:
-            # timing metrics record a timestamp plus the value
-            self.metrics[metric_name].append({
-                'timestamp': datetime.datetime.now().isoformat(),
-                'value': value
-            })
-        else:
-            # other metrics accumulate directly
-            if isinstance(self.metrics[metric_name], (int, float)):
-                self.metrics[metric_name] += value
-            elif isinstance(self.metrics[metric_name], list):
-                self.metrics[metric_name].append({
-                    'timestamp': datetime.datetime.now().isoformat(),
-                    'value': value
-                })
-
-    def _get_system_metrics(self):
-        """Collect system resource usage metrics"""
-        if not self.enable_metrics:
-            return {}
-
-        metrics = {
-            'memory': psutil.virtual_memory().percent,
-            'cpu': psutil.cpu_percent(),
-        }
-
-        # a safer way to read the connection count
-        if hasattr(self, 'pool') and self.pool is not None:
-            try:
-                # different pool implementations may use different attribute names
-                if hasattr(self.pool, '_connections'):
-                    connections = self.pool._connections
-                    metrics['connections'] = len(connections) if hasattr(connections, '__len__') else 0
-                else:
-                    metrics['connections'] = 0
-            except Exception:
-                metrics['connections'] = 0
-        else:
-            metrics['connections'] = 0
-
-        return metrics
-
-    def _log_with_metrics(self, level: str, message: str, extra: Optional[Dict] = None):
-        """Write a log record"""
-        if not self.logger:
-            return
-
-        if len(message) > 500:
-            message = message[:500] + '...'
-
-        now = time.time()
-        if now - self._last_metrics_time > self.metrics_interval:
-            self._metrics_cache = self._get_system_metrics()
-            # use the cached metrics
-            log_extra = {'metrics': self._metrics_cache}
-            self._last_metrics_time = now
-        else:
-            # collect fresh system metrics
-            metrics = self._get_system_metrics()
-            log_extra = {'metrics': metrics}
-
-        if extra:
-            log_extra.update(extra)
-
-        getattr(self.logger, level.lower())(message, extra={'extra_data': log_extra})
-
     def _create_connection_pool(self) -> PooledDB:
         """Create the database connection pool"""
         if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
@@ -1386,7 +1221,7 @@ class MySQLUploader:
             required_keys = {'ca', 'cert', 'key'}
             if not all(k in self.ssl for k in required_keys):
                 error_msg = "SSL config must include ca, cert and key"
-
+                logger.error(error_msg)
                 raise ValueError(error_msg)
             pool_params['ssl'] = {
                 'ca': self.ssl['ca'],
@@ -1398,17 +1233,15 @@ class MySQLUploader:
         try:
             pool = PooledDB(**pool_params)
             elapsed = time.time() - start_time
-
-            self._log_with_metrics('info', "Connection pool created", {
+            logger.info("Connection pool created", {
                 'pool_size': self.pool_size,
                 'time_elapsed': elapsed
             })
            return pool
         except Exception as e:
             elapsed = time.time() - start_time
-            self._record_metrics('connection_pool_failures', 1)
             self.pool = None
-
+            logger.error("Failed to create connection pool", {
                 'error': str(e),
                 'time_elapsed': elapsed
             })
@@ -1421,7 +1254,7 @@ class MySQLUploader:
             start_time = time.time()
             operation = func.__name__
 
-
+            logger.debug(f"Starting operation: {operation}", {
                 'attempt': 1,
                 'max_retries': self.max_retries
             })
@@ -1432,14 +1265,13 @@ class MySQLUploader:
                     elapsed = time.time() - start_time
 
                     if attempt > 0:
-
-                        self._log_with_metrics('info', "Operation succeeded (after retry)", {
+                        logger.info("Operation succeeded (after retry)", {
                             'operation': operation,
                             'attempts': attempt + 1,
                             'time_elapsed': elapsed
                         })
                     else:
-
+                        logger.debug("Operation succeeded", {
                             'operation': operation,
                             'time_elapsed': elapsed
                         })
@@ -1448,7 +1280,6 @@ class MySQLUploader:
 
                 except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
                     last_exception = e
-                    self._record_metrics('database_errors', 1)
 
                     # record detailed MySQL error info
                     error_details = {
@@ -1462,26 +1293,25 @@ class MySQLUploader:
                     if attempt < self.max_retries - 1:
                         wait_time = self.retry_interval * (attempt + 1)
                         error_details['wait_time'] = wait_time
-
+                        logger.warning(f"Database operation failed, preparing to retry {error_details}", )
                         time.sleep(wait_time)
 
                         # attempt to reconnect
                         try:
                             self.pool = self._create_connection_pool()
-
+                            logger.info("Database connection re-established")
                         except Exception as reconnect_error:
-
+                            logger.error("Reconnect failed", {
                                 'error': str(reconnect_error)
                             })
                     else:
                         elapsed = time.time() - start_time
                         error_details['time_elapsed'] = elapsed
-
+                        logger.error(f"Operation failed permanently {error_details}")
 
                 except pymysql.IntegrityError as e:
                     elapsed = time.time() - start_time
-
-                    self._log_with_metrics('error', "Integrity constraint error", {
+                    logger.error("Integrity constraint error", {
                         'operation': operation,
                         'time_elapsed': elapsed,
                         'error_code': e.args[0] if e.args else None,
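Both classes in this file wrap their database calls in the same retry shape seen in this hunk: linear backoff (`retry_interval * (attempt + 1)`), a reconnect between attempts, and a final error log once retries are exhausted. A minimal, self-contained sketch of that pattern follows; the names are illustrative, not the mdbq implementation.

# Minimal sketch of the linear-backoff retry loop used in both classes.
import functools
import time


def retry_on_failure(max_retries: int = 3, retry_interval: int = 5):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    if attempt < max_retries - 1:
                        # wait 5s, 10s, 15s, ... before the next attempt
                        time.sleep(retry_interval * (attempt + 1))
            raise last_exception
        return wrapper
    return decorator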
@@ -1492,8 +1322,7 @@ class MySQLUploader:
                 except Exception as e:
                     last_exception = e
                     elapsed = time.time() - start_time
-
-                    self._log_with_metrics('error', "Unexpected error occurred", {
+                    logger.error("Unexpected error occurred", {
                         'operation': operation,
                         'time_elapsed': elapsed,
                         'error_type': type(e).__name__,
@@ -1510,10 +1339,10 @@ class MySQLUploader:
         """Get a connection from the pool"""
         try:
             conn = self.pool.connection()
-
+            logger.debug("Acquired database connection")
             return conn
         except Exception as e:
-
+            logger.error(f'{e}')
             raise ConnectionError(f"Failed to connect to database: {str(e)}")
 
     def _check_database_exists(self, db_name: str) -> bool:
@@ -1526,10 +1355,10 @@ class MySQLUploader:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (db_name,))
                 exists = bool(cursor.fetchone())
-
+                logger.debug(f"{db_name} database exists: {exists}")
                 return exists
         except Exception as e:
-
+            logger.error(f"Error while checking database existence: {str(e)}")
             raise
 
     def _create_database(self, db_name: str):
@@ -1542,9 +1371,9 @@ class MySQLUploader:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 conn.commit()
-
+                logger.info(f"{db_name} database created")
         except Exception as e:
-
+            logger.error(f"{db_name}: failed to create database {str(e)}")
             conn.rollback()
             raise
 
@@ -1567,7 +1396,7 @@ class MySQLUploader:
             date_obj = self._validate_datetime(date_value, True)
         except ValueError:
             error_msg = f"Invalid date format 1: {date_value}"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
         if partition_by == 'year':
@@ -1576,7 +1405,7 @@ class MySQLUploader:
             return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
         else:
             error_msg = "partition_by must be 'year' or 'month'"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
     def _validate_identifier(self, identifier: str) -> str:
|
@@ -1590,14 +1419,14 @@ class MySQLUploader:
|
|
1590
1419
|
"""
|
1591
1420
|
if not identifier or not isinstance(identifier, str):
|
1592
1421
|
error_msg = f"无效的标识符: {identifier}"
|
1593
|
-
|
1422
|
+
logger.error(error_msg)
|
1594
1423
|
raise ValueError(error_msg)
|
1595
1424
|
|
1596
1425
|
# 移除非法字符,只保留字母、数字、下划线和美元符号
|
1597
1426
|
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
|
1598
1427
|
if not cleaned:
|
1599
1428
|
error_msg = f"无法清理异常标识符: {identifier}"
|
1600
|
-
|
1429
|
+
logger.error(error_msg)
|
1601
1430
|
raise ValueError(error_msg)
|
1602
1431
|
|
1603
1432
|
# 检查是否为MySQL保留字
|
@@ -1606,7 +1435,7 @@ class MySQLUploader:
             'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
         }
         if cleaned.lower() in mysql_keywords:
-
+            logger.debug(f"MySQL reserved word detected: {cleaned}")
             return f"`{cleaned}`"
 
         return cleaned
@@ -1633,7 +1462,7 @@ class MySQLUploader:
                     cursor.execute(sql, (db_name, table_name))
                     result = bool(cursor.fetchone())
             except Exception as e:
-
+                logger.error(f"Unknown error while checking table existence: {e}", )
                 raise
 
         # run the query and cache the result
@@ -1665,7 +1494,7 @@ class MySQLUploader:
 
         if not set_typ:
             error_msg = "No columns specified for table creation"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
         # build the column-definition SQL
@@ -1710,7 +1539,7 @@ class MySQLUploader:
         with self._get_connection() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
-
+                logger.info(f"{db_name}.{table_name}: table created")
 
                 # add regular indexes
                 index_statements = []
@@ -1736,13 +1565,13 @@ class MySQLUploader:
                     with conn.cursor() as cursor:
                         for stmt in index_statements:
                             cursor.execute(stmt)
-
+                            logger.debug(f"Executed index statement: {stmt}", )
 
                 conn.commit()
-
+                logger.info(f"{db_name}.{table_name}: indexes added")
 
             except Exception as e:
-
+                logger.error(f"{db_name}.{table_name}: failed to create table: {str(e)}")
                 conn.rollback()
                 raise
 
@@ -1812,7 +1641,7 @@ class MySQLUploader:
             return value
         except (ValueError, TypeError) as e:
             error_msg = f"Data type conversion error {value} to type {column_type}: {str(e)}"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
     def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
@@ -1831,10 +1660,10 @@ class MySQLUploader:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (db_name, table_name))
                 set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
-
+                logger.debug(f"{db_name}.{table_name}: fetched column info: {set_typ}")
                 return set_typ
         except Exception as e:
-
+            logger.error(f"Failed to fetch table column info: {str(e)}")
             raise
 
     def _upload_to_table(
@@ -1860,21 +1689,21 @@ class MySQLUploader:
                                    allow_null=allow_null)
             else:
                 error_msg = f"Table does not exist: '{db_name}.{table_name}'"
-
+                logger.error(error_msg)
                 raise ValueError(error_msg)
 
         # fetch and validate the table schema
         table_columns = self._get_table_columns(db_name, table_name)
         if not table_columns:
             error_msg = f"Failed to get columns for '{db_name}.{table_name}'"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
         # verify data columns match table columns
         for col in set_typ:
             if col not in table_columns:
                 error_msg = f"Column does not exist: '{col}' -> '{db_name}.{table_name}'"
-
+                logger.error(error_msg)
                 raise ValueError(error_msg)
 
         # insert the data
@@ -1957,7 +1786,7 @@ class MySQLUploader:
                 data.columns = [col.lower() for col in data.columns]
                 data = data.replace({pd.NA: None}).to_dict('records')
             except Exception as e:
-
+                logger.error(f"Error converting data to dicts: {e}", )
                 raise ValueError(f"Error converting data to dicts: {e}")
         elif isinstance(data, dict):
             data = [{k.lower(): v for k, v in data.items()}]
@@ -1966,7 +1795,7 @@ class MySQLUploader:
             data = [{k.lower(): v for k, v in item.items()} for item in data]
         else:
             error_msg = "Data must be a dict, a list, a list of dicts, or a DataFrame"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
         # lowercase the keys of set_typ
@@ -1988,11 +1817,11 @@ class MySQLUploader:
             if sample_values:
                 inferred_type = self._infer_data_type(sample_values[0])
                 filtered_set_typ[col] = inferred_type
-
+                logger.debug(f"Inferred data type for column '{col}': {inferred_type}")
             else:
                 # no sample values; fall back to the default type
                 filtered_set_typ[col] = 'VARCHAR(255)'
-
+                logger.debug(f"Using default data type for column '{col}': VARCHAR(255)")
 
         prepared_data = []
         for row_idx, row in enumerate(data, 1):
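`_infer_data_type` itself is outside the visible hunks; an illustrative inference of the same flavor - sample one value, map its Python type to a MySQL column type, and fall back to the `VARCHAR(255)` default this hunk uses - might look like:

# Illustrative only - not the real _infer_data_type from mdbq.
import datetime
from decimal import Decimal


def infer_mysql_type(value) -> str:
    if isinstance(value, bool):
        return 'TINYINT(1)'
    if isinstance(value, int):
        return 'BIGINT'
    if isinstance(value, (float, Decimal)):
        return 'DECIMAL(20,6)'
    if isinstance(value, (datetime.datetime, datetime.date)):
        return 'DATETIME'
    return 'VARCHAR(255)'  # same fallback as the hunk above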
for row_idx, row in enumerate(data, 1):
|
@@ -2005,7 +1834,7 @@ class MySQLUploader:
|
|
2005
1834
|
if col_name not in row:
|
2006
1835
|
if not allow_null:
|
2007
1836
|
error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
|
2008
|
-
|
1837
|
+
logger.error(error_msg)
|
2009
1838
|
raise ValueError(error_msg)
|
2010
1839
|
prepared_row[col_name] = None
|
2011
1840
|
else:
|
@@ -2013,11 +1842,11 @@ class MySQLUploader:
                         prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name])
                     except ValueError as e:
                         error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
-
+                        logger.error(error_msg)
                         raise ValueError(error_msg)
             prepared_data.append(prepared_row)
 
-
+        logger.debug(f"Prepared {len(prepared_data)} rows of data")
         return prepared_data, filtered_set_typ
 
     def upload_data(
def upload_data(
|
@@ -2039,14 +1868,12 @@ class MySQLUploader:
|
|
2039
1868
|
上传数据到数据库
|
2040
1869
|
"""
|
2041
1870
|
upload_start = time.time()
|
2042
|
-
self._record_metrics('total_uploads', 1)
|
2043
1871
|
initial_row_count = len(data) if hasattr(data, '__len__') else 1
|
2044
|
-
self.metrics['total_rows'] += len(data) if hasattr(data, '__len__') else 1
|
2045
1872
|
|
2046
1873
|
batch_id = f"batch_{int(time.time() * 1000)}"
|
2047
1874
|
success_flag = False
|
2048
1875
|
|
2049
|
-
|
1876
|
+
logger.info("开始上传数据", {
|
2050
1877
|
'batch_id': batch_id,
|
2051
1878
|
'database': db_name,
|
2052
1879
|
'table': table_name,
|
@@ -2060,12 +1887,12 @@ class MySQLUploader:
         # validate parameters
         if not set_typ:
             error_msg = "Column data types are missing"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
         if partition_by and partition_by not in ['year', 'month']:
             error_msg = "Partitioning must be 'year' or 'month'"
-
+            logger.error(error_msg)
             raise ValueError(error_msg)
 
         # prepare the data
|
@@ -2077,7 +1904,7 @@ class MySQLUploader:
|
|
2077
1904
|
self._create_database(db_name)
|
2078
1905
|
else:
|
2079
1906
|
error_msg = f"数据库不存在: '{db_name}'"
|
2080
|
-
|
1907
|
+
logger.error(error_msg)
|
2081
1908
|
raise ValueError(error_msg)
|
2082
1909
|
|
2083
1910
|
# 处理分表逻辑
|
@@ -2087,7 +1914,7 @@ class MySQLUploader:
                 try:
                     if partition_date_column not in row:
                         error_msg = f"Unexpectedly missing column '{partition_date_column}'"
-
+                        logger.error(error_msg)
                         continue  # skip this row
 
                     part_table = self._get_partition_table_name(
|
|
2099
1926
|
partitioned_data[part_table] = []
|
2100
1927
|
partitioned_data[part_table].append(row)
|
2101
1928
|
except Exception as e:
|
2102
|
-
|
1929
|
+
logger.error("分表处理失败", {
|
2103
1930
|
'row_data': row,
|
2104
1931
|
'error': str(e)
|
2105
1932
|
})
|
@@ -2115,7 +1942,7 @@ class MySQLUploader:
                         indexes, batch_id
                     )
                 except Exception as e:
-
+                    logger.error("Partitioned upload failed", {
                         'partition_table': part_table,
                         'error': str(e)
                     })
@@ -2132,25 +1959,17 @@ class MySQLUploader:
             success_flag = True
 
         except Exception as e:
-
+            logger.error("Global error during upload", {
                 'error': str(e),
                 'error_type': type(e).__name__
             })
         finally:
             elapsed = time.time() - upload_start
-
-
-            if success_flag:
-                self._record_metrics('successful_uploads', 1)
-            else:
-                self._record_metrics('failed_uploads', 1)
-
-            self._log_with_metrics('info', "Upload processing finished", {
+            logger.info("Upload processing finished", {
                 'batch_id': batch_id,
                 'success': success_flag,
                 'time_elapsed': elapsed,
-                'initial_row_count': initial_row_count
-                'processed_rows': self.metrics['successful_rows'] + self.metrics['failed_rows']
+                'initial_row_count': initial_row_count
             })
 
     def _insert_data(
@@ -2263,7 +2082,7 @@ class MySQLUploader:
                         'duplicate_check': check_duplicate,
                         'duplicate_columns': duplicate_columns
                     }
-
+                    logger.error(f"Single-row insert failed: {error_details}")
                     continue  # skip this row and continue with the next
 
             # update statistics
@@ -2276,8 +2095,6 @@ class MySQLUploader:
             total_inserted += successful_rows
 
             batch_elapsed = time.time() - batch_start
-            self._record_metrics('batch_execution_time', batch_elapsed, is_timing=True)
-
             batch_info = {
                 'batch_id': batch_id,
                 'batch_index': i // batch_size + 1,
@@ -2288,68 +2105,42 @@ class MySQLUploader:
                 'time_elapsed': batch_elapsed,
                 'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
             }
-
+            logger.debug(f"Batch finished {batch_info}")
 
-
-        self.metrics['failed_rows'] += total_failed
-        self._log_with_metrics('info', "Data insertion finished", {
+        logger.info("Data insertion finished", {
            'total_rows': len(data),
             'inserted_rows': total_inserted,
             'skipped_rows': total_skipped,
             'failed_rows': total_failed
         })
 
-    def get_metrics(self) -> Dict:
-        """Return the current performance metrics"""
-        metrics = self.metrics.copy()
-
-        # add current system metrics
-        metrics.update({
-            'current_time': datetime.datetime.now().isoformat(),
-            'system': self._get_system_metrics(),
-            'connection_pool': {
-                'size': self.pool_size,
-                'active': len(self.pool._connections) if hasattr(self.pool, '_connections') else 0
-            }
-        })
-
-        return metrics
-
     def close(self):
         """Close the connection pool and record final metrics"""
         close_start = time.time()
 
         try:
             if hasattr(self, 'pool') and self.pool is not None:
-                # record the pool state before closing
-                active_connections = self._get_system_metrics().get('connections', 0)
-
                 # a safer way to close
                 try:
                     self.pool.close()
                 except Exception as e:
-
+                    logger.warning("Error while closing connection pool", {
                         'error': str(e)
                     })
 
                 self.pool = None
 
-                elapsed = time.time() - close_start
-
-                    'active_connections_before_close': active_connections,
+                elapsed = round(time.time() - close_start, 2)
+                logger.info("Connection pool closed", {
                     'close_time_elapsed': elapsed
                 })
         except Exception as e:
-            elapsed = time.time() - close_start
-
+            elapsed = round(time.time() - close_start, 2)
+            logger.error("Failed to close connection pool", {
                 'error': str(e),
                 'close_time_elapsed': elapsed
             })
             raise
-        finally:
-            # record final performance metrics
-            if hasattr(self, 'logger') and self.logger and self.enable_metrics:
-                self._log_with_metrics('debug', "Final performance metrics", self.get_metrics())
 
     def _check_pool_health(self):
         """Periodically check connection-pool health"""
@@ -2358,8 +2149,8 @@ class MySQLUploader:
             conn.ping(reconnect=True)
             conn.close()
             return True
-        except Exception:
-
+        except Exception as e:
+            logger.warning("Connection pool health check failed", {
                 'error': str(e)
             })
             return False
@@ -2428,9 +2219,6 @@ class MySQLDeduplicator:
             max_workers: int = 1,
             batch_size: int = 1000,
             skip_system_dbs: bool = True,
-            logging_mode: str = 'console',
-            log_level: str = 'INFO',
-            log_file: str = 'mysql_deduplicate.log',
             max_retries: int = 3,
             retry_interval: int = 5,
             pool_size: int = 5
@@ -2446,9 +2234,6 @@ class MySQLDeduplicator:
         :param max_workers: maximum number of worker threads, defaults to 1 (single-threaded)
         :param batch_size: batch size, defaults to 1000
         :param skip_system_dbs: skip system databases or not, defaults to True
-        :param logging_mode: logging mode ('console', 'file', 'both', 'none')
-        :param log_level: log level ('DEBUG', 'INFO', 'WARNING', 'ERROR')
-        :param log_file: log file path
         :param max_retries: maximum number of retries
         :param retry_interval: retry interval (seconds)
         :param pool_size: connection pool size
@@ -2476,69 +2261,20 @@ class MySQLDeduplicator:
         self._lock = threading.Lock()
         self._processing_tables = set()  # set of tables currently being processed
 
-        # initialize logging
-        self._init_logging(logging_mode, log_level, log_file)
-
         # list of system databases
         self.SYSTEM_DATABASES = {
             'information_schema', 'mysql',
             'performance_schema', 'sys'
         }
 
-    def _init_logging(
-            self,
-            logging_mode: str,
-            log_level: str,
-            log_file: str
-    ):
-        """Initialize the logging configuration"""
-        self.logger = logging.getLogger('mysql_deduplicator')
-        self.logger.setLevel(log_level.upper())
-
-        # avoid adding duplicate handlers
-        if self.logger.handlers:
-            for handler in self.logger.handlers[:]:
-                self.logger.removeHandler(handler)
-
-        formatter = logging.Formatter(
-            '%(asctime)s - %(levelname)s - %(message)s',
-            datefmt='%Y-%m-%d %H:%M:%S'
-        )
-
-        mode = logging_mode.lower()
-        if mode in ('both', 'console'):
-            console_handler = logging.StreamHandler()
-            console_handler.setFormatter(formatter)
-            self.logger.addHandler(console_handler)
-
-        if mode in ('both', 'file'):
-            file_handler = logging.FileHandler(
-                filename=log_file,
-                encoding='utf-8'
-            )
-            file_handler.setFormatter(formatter)
-            self.logger.addHandler(file_handler)
-
-    def _log(self, level: str, message: str, extra: Optional[Dict] = None):
-        """Unified logging helper"""
-        if not hasattr(self.logger, level.lower()):
-            return
-
-        # truncate overly long log messages
-        if len(message) > 500:
-            message = message[:500] + '...'
-
-        log_method = getattr(self.logger, level.lower())
-        log_method(message, extra=extra)
-
     def _get_connection(self):
         """Get a connection from the pool"""
         try:
             conn = self.pool.connection()
-
+            logger.debug("Acquired database connection")
             return conn
         except Exception as e:
-
+            logger.error(f"Failed to acquire database connection: {str(e)}")
             raise ConnectionError(f"Failed to connect to database: {str(e)}")
 
     @staticmethod
@staticmethod
|
@@ -2555,16 +2291,14 @@ class MySQLDeduplicator:
|
|
2555
2291
|
last_exception = e
|
2556
2292
|
if attempt < self.max_retries:
|
2557
2293
|
wait_time = self.retry_interval * (attempt + 1)
|
2558
|
-
|
2559
|
-
|
2560
|
-
|
2294
|
+
logger.warning(
|
2295
|
+
f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
|
2296
|
+
{'error': str(e), 'wait_time': wait_time})
|
2561
2297
|
time.sleep(wait_time)
|
2562
2298
|
continue
|
2563
2299
|
except Exception as e:
|
2564
2300
|
last_exception = e
|
2565
|
-
|
2566
|
-
f"操作失败: {str(e)}",
|
2567
|
-
{'error_type': type(e).__name__})
|
2301
|
+
logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
|
2568
2302
|
break
|
2569
2303
|
|
2570
2304
|
if last_exception:
|
@@ -2620,7 +2354,7 @@ class MySQLDeduplicator:
 
         with self._lock:
             if key in self._processing_tables:
-
+                logger.debug(f"Table {key} is being processed by another thread, skipping")
                 return False
             self._processing_tables.add(key)
             return True
@@ -2653,12 +2387,12 @@ class MySQLDeduplicator:
             return (0, 0)
 
         try:
-
+            logger.info(f"Processing table: {database}.{table}")
 
             # get the actual column names
             all_columns = self._get_table_columns(database, table)
             if not all_columns:
-
+                logger.warning(f"Table {database}.{table} has no usable columns (possibly only an id column), skipping")
                 return (0, 0)
 
             # use the specified columns, or all of them
@@ -2666,14 +2400,14 @@ class MySQLDeduplicator:
             invalid_columns = set(use_columns) - set(all_columns)
 
             if invalid_columns:
-
-
-
-
+                logger.warning(
+                    f"Table {database}.{table} does not contain columns: {invalid_columns}; using valid columns only",
+                    {'invalid_columns': invalid_columns}
+                )
                 use_columns = [col for col in use_columns if col in all_columns]
 
             if not use_columns:
-
+                logger.error(f"Table {database}.{table} has no valid deduplication columns")
                 return (0, 0)
 
             # build the deduplication SQL
@@ -2706,31 +2440,31 @@ class MySQLDeduplicator:
                     dup_count = cursor.fetchone()['cnt']
 
                     if dup_count == 0:
-
+                        logger.info(f"Table {database}.{table} has no duplicate data")
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
 
-
-
-
-
+                    logger.info(
+                        f"Table {database}.{table}: found {dup_count} groups of duplicates",
+                        {'columns': use_columns}
+                    )
 
                     if not dry_run:
                         # perform the actual deletion
                         cursor.execute(delete_dup_sql)
                         affected_rows = cursor.rowcount
                         conn.commit()
-
-
-
-
+                        logger.info(
+                            f"Table {database}.{table}: deleted {affected_rows} duplicate rows",
+                            {'columns': use_columns}
+                        )
                     else:
                         affected_rows = 0
-
-
-
-
+                        logger.info(
+                            f"[dry run] Table {database}.{table} would delete {dup_count} groups of duplicates",
+                            {'columns': use_columns}
+                        )
 
                     # clean up the temporary table
                     cursor.execute(drop_temp_sql)
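The statements this hunk executes (the count query returning `cnt`, `delete_dup_sql`, and `drop_temp_sql`) are built outside the visible hunks. One plausible shape for that builder, sketched under the assumptions that each table has an auto-increment `id` column and that the smallest `id` per duplicate group is kept - neither of which this diff confirms:

# Plausible reconstruction only - the real SQL builder is not in this diff.
def build_dedup_sql(database, table, columns):
    cols = ', '.join(f'`{c}`' for c in columns)
    join_cond = ' AND '.join(f't.`{c}` <=> k.`{c}`' for c in columns)
    target = f'`{database}`.`{table}`'
    temp = '`tmp_dedup_keep`'
    create_temp_sql = (
        f'CREATE TEMPORARY TABLE {temp} AS '
        f'SELECT MIN(`id`) AS keep_id, {cols} FROM {target} '
        f'GROUP BY {cols} HAVING COUNT(*) > 1'
    )
    count_dup_sql = f'SELECT COUNT(*) AS cnt FROM {temp}'  # one row per duplicate group
    delete_dup_sql = (
        f'DELETE t FROM {target} AS t JOIN {temp} AS k '
        f'ON {join_cond} AND t.`id` <> k.keep_id'
    )
    drop_temp_sql = f'DROP TEMPORARY TABLE IF EXISTS {temp}'
    return create_temp_sql, count_dup_sql, delete_dup_sql, drop_temp_sql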
@@ -2739,10 +2473,10 @@ class MySQLDeduplicator:
             return (dup_count, affected_rows)
 
         except Exception as e:
-
-
-
-
+            logger.error(
+                f"Error while processing table {database}.{table}: {str(e)}",
+                {'error_type': type(e).__name__}
+            )
             return (0, 0)
         finally:
             self._release_table_lock(database, table)
|
|
2766
2500
|
try:
|
2767
2501
|
# 检查表是否存在
|
2768
2502
|
if not self._check_table_exists(database, table):
|
2769
|
-
|
2503
|
+
logger.warning(f"表 {database}.{table} 不存在,跳过")
|
2770
2504
|
return (0, 0)
|
2771
2505
|
|
2772
2506
|
return self._deduplicate_table(database, table, columns, dry_run)
|
2773
2507
|
except Exception as e:
|
2774
|
-
|
2775
|
-
|
2776
|
-
|
2777
|
-
|
2508
|
+
logger.error(
|
2509
|
+
f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
|
2510
|
+
{'error_type': type(e).__name__}
|
2511
|
+
)
|
2778
2512
|
return (0, 0)
|
2779
2513
|
|
2780
2514
|
def deduplicate_database(
|
@@ -2800,19 +2534,19 @@ class MySQLDeduplicator:
         try:
             # check whether the database exists
             if not self._check_database_exists(database):
-
+                logger.warning(f"Database {database} does not exist, skipping")
                 return results
 
             # get the tables to process
             target_tables = tables or self._get_tables(database)
             if not target_tables:
-
+                logger.info(f"Database {database} contains no tables, skipping")
                 return results
 
-
-
-
-
+            logger.info(
+                f"Processing {len(target_tables)} tables in database {database}",
+                {'tables': target_tables}
+            )
 
             if parallel and self.max_workers > 1:
                 # parallel processing
|
|
2833
2567
|
dup_count, affected_rows = future.result()
|
2834
2568
|
results[table] = (dup_count, affected_rows)
|
2835
2569
|
except Exception as e:
|
2836
|
-
|
2837
|
-
|
2838
|
-
|
2839
|
-
|
2570
|
+
logger.error(
|
2571
|
+
f"处理表 {database}.{table} 时出错: {str(e)}",
|
2572
|
+
{'error_type': type(e).__name__}
|
2573
|
+
)
|
2840
2574
|
results[table] = (0, 0)
|
2841
2575
|
else:
|
2842
2576
|
# 串行处理
|
@@ -2851,18 +2585,15 @@ class MySQLDeduplicator:
|
|
2851
2585
|
total_dup = sum(r[0] for r in results.values())
|
2852
2586
|
total_del = sum(r[1] for r in results.values())
|
2853
2587
|
|
2854
|
-
|
2855
|
-
|
2856
|
-
|
2857
|
-
|
2588
|
+
logger.info(
|
2589
|
+
f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
|
2590
|
+
{'results': results}
|
2591
|
+
)
|
2858
2592
|
|
2859
2593
|
return results
|
2860
2594
|
|
2861
2595
|
except Exception as e:
|
2862
|
-
|
2863
|
-
f"处理数据库 {database} 时发生全局错误: {str(e)}",
|
2864
|
-
{'error_type': type(e).__name__}
|
2865
|
-
)
|
2596
|
+
logger.error(f"处理数据库 {database} 时发生全局错误: {str(e)}", {'error_type': type(e).__name__})
|
2866
2597
|
return results
|
2867
2598
|
|
2868
2599
|
def deduplicate_all(
|
@@ -2889,13 +2620,10 @@ class MySQLDeduplicator:
|
|
2889
2620
|
# 获取要处理的数据库
|
2890
2621
|
target_dbs = databases or self._get_databases()
|
2891
2622
|
if not target_dbs:
|
2892
|
-
|
2623
|
+
logger.warning("没有可处理的数据库")
|
2893
2624
|
return all_results
|
2894
2625
|
|
2895
|
-
|
2896
|
-
f"开始处理 {len(target_dbs)} 个数据库",
|
2897
|
-
{'databases': target_dbs}
|
2898
|
-
)
|
2626
|
+
logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
|
2899
2627
|
|
2900
2628
|
if parallel and self.max_workers > 1:
|
2901
2629
|
# 并行处理数据库
|
@@ -2917,10 +2645,7 @@ class MySQLDeduplicator:
|
|
2917
2645
|
db_results = future.result()
|
2918
2646
|
all_results[db] = db_results
|
2919
2647
|
except Exception as e:
|
2920
|
-
|
2921
|
-
f"处理数据库 {db} 时出错: {str(e)}",
|
2922
|
-
{'error_type': type(e).__name__}
|
2923
|
-
)
|
2648
|
+
logger.error(f"处理数据库 {db} 时出错: {str(e)}", {'error_type': type(e).__name__})
|
2924
2649
|
all_results[db] = {}
|
2925
2650
|
else:
|
2926
2651
|
# 串行处理数据库
|
@@ -2942,18 +2667,15 @@ class MySQLDeduplicator:
|
|
2942
2667
|
for r in db.values()
|
2943
2668
|
)
|
2944
2669
|
|
2945
|
-
|
2946
|
-
|
2947
|
-
|
2948
|
-
|
2670
|
+
logger.info(
|
2671
|
+
f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
|
2672
|
+
{'total_results': all_results}
|
2673
|
+
)
|
2949
2674
|
|
2950
2675
|
return all_results
|
2951
2676
|
|
2952
2677
|
except Exception as e:
|
2953
|
-
|
2954
|
-
f"全局处理时发生错误: {str(e)}",
|
2955
|
-
{'error_type': type(e).__name__}
|
2956
|
-
)
|
2678
|
+
logger.error(f"全局处理时发生错误: {str(e)}", {'error_type': type(e).__name__})
|
2957
2679
|
return all_results
|
2958
2680
|
|
2959
2681
|
@_retry_on_failure
|
@@ -2985,12 +2707,9 @@ class MySQLDeduplicator:
         try:
             if hasattr(self, 'pool') and self.pool:
                 self.pool.close()
-
+                logger.info("Database connection pool closed")
         except Exception as e:
-
-                f"Error while closing connection pool: {str(e)}",
-                {'error_type': type(e).__name__}
-            )
+            logger.error(f"Error while closing connection pool: {str(e)}", {'error_type': type(e).__name__})
         finally:
             self.pool = None
 
@@ -3004,11 +2723,9 @@ class MySQLDeduplicator:
 def main():
     uploader = MySQLUploader(
         username='root',
-        password='
+        password='pw',
         host='localhost',
         port=3306,
-        logging_mode='console',
-        log_level='info'
     )
 
     # define columns and data types
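A hypothetical continuation of main(), assembled from parameter names that appear in the hunks above (db_name, table_name, data, set_typ, partition_by, partition_date_column, allow_null); the exact upload_data signature is not shown in this diff:

# Hypothetical usage sketch - parameter names taken from the hunks above,
# exact signature and accepted values unverified.
set_typ = {
    'name': 'VARCHAR(255)',
    'date': 'DATE',
    'amount': 'DECIMAL(10,2)',
}
data = [
    {'name': 'item-1', 'date': '2025-04-07', 'amount': 9.9},
    {'name': 'item-2', 'date': '2025-04-08', 'amount': 19.9},
]
uploader.upload_data(
    db_name='my_db',
    table_name='my_table',
    data=data,
    set_typ=set_typ,
    partition_by='month',              # split into my_table_2025_04, ...
    partition_date_column='date',
    allow_null=False,
)
uploader.close()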
@@ -3050,24 +2767,22 @@ def main():
 def main2():
     deduplicator = MySQLDeduplicator(
         username='root',
-        password='
+        password='pw',
         host='localhost',
         port=3306
     )
 
-    #
-
+    # deduplicate all databases (single-threaded)
+    deduplicator.deduplicate_all()
 
     # # deduplicate a specific database (multi-threaded)
     # deduplicator.deduplicate_database('my_db', parallel=True)
 
-    # deduplicate a specific table (specific columns)
-    deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+    # # deduplicate a specific table (specific columns)
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
 
     # close the connection
     deduplicator.close()
 
 if __name__ == '__main__':
     pass
-
-    main2()