mdbq 3.9.10__py3-none-any.whl → 3.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/mysql.py +1 -1669
- mdbq/mysql/uploader.py +30 -25
- {mdbq-3.9.10.dist-info → mdbq-3.9.11.dist-info}/METADATA +1 -1
- {mdbq-3.9.10.dist-info → mdbq-3.9.11.dist-info}/RECORD +7 -7
- {mdbq-3.9.10.dist-info → mdbq-3.9.11.dist-info}/WHEEL +0 -0
- {mdbq-3.9.10.dist-info → mdbq-3.9.11.dist-info}/top_level.txt +0 -0
mdbq/mysql/mysql.py
CHANGED
```diff
@@ -5,23 +5,12 @@ import time
 from functools import wraps
 import warnings
 import pymysql
-import numpy as np
 import pandas as pd
 from sqlalchemy import create_engine
 import os
-import logging
-import logging.handlers
 from mdbq.other import otk
 from mdbq.log import mylogger
-from typing import Union, List, Dict, Optional, Any, Tuple, Set
-from dbutils.pooled_db import PooledDB
 import json
-import psutil
-from collections import OrderedDict
-import threading
-import concurrent.futures
-from collections import defaultdict
-
 
 warnings.filterwarnings('ignore')
 """
```
```diff
@@ -187,7 +176,7 @@ class MysqlUpload:
             if str(cut_data).lower() == 'year':
                 table_name = f'{table_name}_{__y}'
             elif str(cut_data).lower() == 'month':
-                table_name = f'{table_name}_{__y_m}'
+                table_name = f'{table_name}-{__y_m}'
             else:
                 logger.info(f'参数不正确,cut_data应为 year 或 month ')
         except Exception as e:
```
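The only functional change in this hunk is the separator used for month-partitioned table names, which appears to move from an underscore to a hyphen. A minimal sketch of the resulting names, assuming `__y` and `__y_m` hold year and year-month strings (both are defined elsewhere in this file, so their exact format is an assumption here):

```python
import datetime

# Hypothetical stand-ins for this file's __y / __y_m values.
now = datetime.datetime.now()
__y = now.strftime('%Y')        # e.g. '2023'
__y_m = now.strftime('%Y-%m')   # e.g. '2023-01'

table_name = 'sales'
print(f'{table_name}_{__y}')    # year cut:  sales_2023  (unchanged)
print(f'{table_name}-{__y_m}')  # month cut: sales-2023-01 (previously joined with '_')
```

Callers that query month partitions by name will need to match the new hyphenated form.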
```diff
@@ -1127,1662 +1116,5 @@ class OptimizeDatas:
         self.connection.close()
 
 
-class StatementCache(OrderedDict):
-    """LRU cache policy"""
-    def __init__(self, maxsize=100):
-        super().__init__()
-        self.maxsize = maxsize
-
-    def __setitem__(self, key, value):
-        super().__setitem__(key, value)
-        if len(self) > self.maxsize:
-            self.popitem(last=False)
-
-
```
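Review note: despite the "LRU" docstring, the removed `StatementCache` only evicts by insertion order, since plain reads never refresh an entry's position. A self-contained sketch of the actual eviction behavior:

```python
from collections import OrderedDict

class StatementCache(OrderedDict):
    """Bounded cache: evicts the oldest-inserted entry once past maxsize."""
    def __init__(self, maxsize=100):
        super().__init__()
        self.maxsize = maxsize

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        if len(self) > self.maxsize:
            self.popitem(last=False)  # drop the oldest insertion

cache = StatementCache(maxsize=2)
cache['a'] = 1
cache['b'] = 2
_ = cache['a']      # a read does NOT refresh recency here
cache['c'] = 3      # evicts 'a', the oldest insertion
print(list(cache))  # ['b', 'c']
```

A true LRU would call `move_to_end(key)` on reads; as removed, the class is effectively a FIFO bound, which is harmless for its prepared-statement use but worth knowing.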
```diff
-class MySQLUploader:
-    def __init__(
-            self,
-            username: str,
-            password: str,
-            host: str = 'localhost',
-            port: int = 3306,
-            charset: str = 'utf8mb4',
-            collation: str = 'utf8mb4_0900_ai_ci',  # utf8mb4_0900_ai_ci is case-insensitive; utf8mb4_0900_as_cs / utf8mb4_bin are case-sensitive
-            max_retries: int = 10,
-            retry_interval: int = 10,
-            pool_size: int = 5,
-            connect_timeout: int = 10,
-            read_timeout: int = 30,
-            write_timeout: int = 30,
-            ssl: Optional[Dict] = None
-    ):
-        """
-        :param username: database username
-        :param password: database password
-        :param host: database host address, defaults to localhost
-        :param port: database port, defaults to 3306
-        :param charset: character set, defaults to utf8mb4
-        :param collation: collation, defaults to utf8mb4_0900_ai_ci
-
-        :param max_retries: maximum number of retries, defaults to 10
-        :param retry_interval: retry interval in seconds, defaults to 10
-        :param pool_size: connection pool size, defaults to 5
-        :param connect_timeout: connect timeout in seconds, defaults to 10
-        :param read_timeout: read timeout in seconds, defaults to 30
-        :param write_timeout: write timeout in seconds, defaults to 30
-        :param ssl: SSL configuration dict, defaults to None
-        """
-        self.username = username
-        self.password = password
-        self.host = host
-        self.port = port
-        self.charset = charset
-        self.collation = collation
-        self.max_retries = max(max_retries, 1)
-        self.retry_interval = max(retry_interval, 1)
-        self.pool_size = max(pool_size, 1)
-        self.connect_timeout = connect_timeout
-        self.read_timeout = read_timeout
-        self.write_timeout = write_timeout
-        self.ssl = ssl
-        self._prepared_statements = StatementCache(maxsize=100)
-        self._max_cached_statements = 100
-        self._table_metadata_cache = {}
-        self.metadata_cache_ttl = 300  # 5-minute cache TTL
-
-        # Create the connection pool
-        self.pool = self._create_connection_pool()
-
-    def _create_connection_pool(self) -> PooledDB:
-        """Create the database connection pool"""
-        if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
-            return self.pool
-
-        start_time = time.time()
-        self.pool = None
-
-        pool_params = {
-            'creator': pymysql,
-            'host': self.host,
-            'port': self.port,
-            'user': self.username,
-            'password': self.password,
-            'charset': self.charset,
-            'cursorclass': pymysql.cursors.DictCursor,
-            'maxconnections': self.pool_size,
-            'ping': 7,
-            'connect_timeout': self.connect_timeout,
-            'read_timeout': self.read_timeout,
-            'write_timeout': self.write_timeout,
-            'autocommit': False
-        }
-
-        if self.ssl:
-            required_keys = {'ca', 'cert', 'key'}
-            if not all(k in self.ssl for k in required_keys):
-                error_msg = "SSL配置必须包含ca、cert和key"
-                logger.error(error_msg)
-                raise ValueError(error_msg)
-            pool_params['ssl'] = {
-                'ca': self.ssl['ca'],
-                'cert': self.ssl['cert'],
-                'key': self.ssl['key'],
-                'check_hostname': self.ssl.get('check_hostname', False)
-            }
-
-        try:
-            pool = PooledDB(**pool_params)
-            elapsed = time.time() - start_time
-            logger.info("连接池创建成功", {
-                'pool_size': self.pool_size,
-                'time_elapsed': elapsed
-            })
-            return pool
-        except Exception as e:
-            elapsed = time.time() - start_time
-            self.pool = None
-            logger.error("连接池创建失败", {
-                'error': str(e),
-                'time_elapsed': elapsed
-            })
-            raise ConnectionError(f"连接池创建失败: {str(e)}")
-
-    def _execute_with_retry(self, func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            last_exception = None
-            start_time = time.time()
-            operation = func.__name__
-
-            logger.debug(f"开始执行操作: {operation}", {
-                'attempt': 1,
-                'max_retries': self.max_retries
-            })
-
-            for attempt in range(self.max_retries):
-                try:
-                    result = func(*args, **kwargs)
-                    elapsed = time.time() - start_time
-
-                    if attempt > 0:
-                        logger.info("操作成功(重试后)", {
-                            'operation': operation,
-                            'attempts': attempt + 1,
-                            'time_elapsed': elapsed
-                        })
-                    else:
-                        logger.debug("操作成功", {
-                            'operation': operation,
-                            'time_elapsed': elapsed
-                        })
-
-                    return result
-
-                except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
-                    last_exception = e
-
-                    # Log detailed MySQL error information
-                    error_details = {
-                        'operation': operation,
-                        'error_code': e.args[0] if e.args else None,
-                        'error_message': e.args[1] if len(e.args) > 1 else None,
-                        'attempt': attempt + 1,
-                        'max_retries': self.max_retries
-                    }
-
-                    if attempt < self.max_retries - 1:
-                        wait_time = self.retry_interval * (attempt + 1)
-                        error_details['wait_time'] = wait_time
-                        logger.warning(f"数据库操作失败,准备重试 {error_details}", )
-                        time.sleep(wait_time)
-
-                        # Try to re-establish the connection
-                        try:
-                            self.pool = self._create_connection_pool()
-                            logger.info("成功重新建立数据库连接")
-                        except Exception as reconnect_error:
-                            logger.error("重连失败", {
-                                'error': str(reconnect_error)
-                            })
-                    else:
-                        elapsed = time.time() - start_time
-                        error_details['time_elapsed'] = elapsed
-                        logger.error(f"操作最终失败 {error_details}")
-
-                except pymysql.IntegrityError as e:
-                    elapsed = time.time() - start_time
-                    logger.error("完整性约束错误", {
-                        'operation': operation,
-                        'time_elapsed': elapsed,
-                        'error_code': e.args[0] if e.args else None,
-                        'error_message': e.args[1] if len(e.args) > 1 else None
-                    })
-                    raise e
-
-                except Exception as e:
-                    last_exception = e
-                    elapsed = time.time() - start_time
-                    logger.error("发生意外错误", {
-                        'operation': operation,
-                        'time_elapsed': elapsed,
-                        'error_type': type(e).__name__,
-                        'error_message': str(e),
-                        'error_args': e.args if hasattr(e, 'args') else None
-                    })
-                    break
-
-            raise last_exception if last_exception else Exception("发生未知错误")
-
-        return wrapper
-
```
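The retry wrapper above backs off linearly, not exponentially: `wait_time = self.retry_interval * (attempt + 1)`. A quick worked check of the schedule at the constructor defaults (`max_retries=10`, `retry_interval=10`):

```python
retry_interval, max_retries = 10, 10  # the defaults in __init__ above

# Sleeps happen after attempts 1..9; the 10th failure raises instead.
waits = [retry_interval * (attempt + 1) for attempt in range(max_retries - 1)]
print(waits)       # [10, 20, 30, 40, 50, 60, 70, 80, 90]
print(sum(waits))  # 450 -> up to 7.5 minutes of sleep before giving up
```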
```diff
-    def _get_connection(self):
-        """Get a connection from the pool"""
-        try:
-            conn = self.pool.connection()
-            logger.debug("获取数据库连接")
-            return conn
-        except Exception as e:
-            logger.error(f'{e}')
-            raise ConnectionError(f"连接数据库失败: {str(e)}")
-
-    def _check_database_exists(self, db_name: str) -> bool:
-        """Check whether the database exists"""
-        db_name = self._validate_identifier(db_name)
-        sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
-        try:
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    cursor.execute(sql, (db_name,))
-                    exists = bool(cursor.fetchone())
-                    logger.debug(f"{db_name} 数据库已存在: {exists}")
-                    return exists
-        except Exception as e:
-            logger.error(f"检查数据库是否存在时出错: {str(e)}")
-            raise
-
-    def _create_database(self, db_name: str):
-        """Create the database"""
-        db_name = self._validate_identifier(db_name)
-        sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
-
-        try:
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    cursor.execute(sql)
-                conn.commit()
-                logger.info(f"{db_name} 数据库已创建")
-        except Exception as e:
-            logger.error(f"{db_name}: 无法创建数据库 {str(e)}")
-            conn.rollback()
-            raise
-
-    def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
-        """
-        Get the partition table name
-
-        :param table_name: base table name
-        :param date_value: date value
-        :param partition_by: partition mode ('year' or 'month')
-        :return: partition table name
-        :raises ValueError: if the date format or partition mode is invalid
-        """
-        try:
-            # date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
-            date_obj = self._validate_datetime(date_value, True)
-        except ValueError:
-            try:
-                # date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
-                date_obj = self._validate_datetime(date_value, True)
-            except ValueError:
-                error_msg = f"无效的日期格式1: {date_value}"
-                logger.error(error_msg)
-                raise ValueError(error_msg)
-
-        if partition_by == 'year':
-            return f"{table_name}_{date_obj.year}"
-        elif partition_by == 'month':
-            return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
-        else:
-            error_msg = "partition_by must be 'year' or 'month'"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-    def _validate_identifier(self, identifier: str) -> str:
-        """
-        Validate and sanitize a database identifier (database, table or column name)
-        to guard against SQL injection and illegal characters
-
-        :param identifier: identifier to validate
-        :return: sanitized, safe identifier
-        :raises ValueError: if the identifier is invalid
-        """
-        if not identifier or not isinstance(identifier, str):
-            error_msg = f"无效的标识符: {identifier}"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        # Strip illegal characters; keep word characters, CJK characters and dollar signs
-        cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
-        if not cleaned:
-            error_msg = f"无法清理异常标识符: {identifier}"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        # Check for MySQL reserved words
-        mysql_keywords = {
-            'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
-            'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
-        }
-        if cleaned.lower() in mysql_keywords:
-            logger.debug(f"存在MySQL保留字: {cleaned}")
-            return f"`{cleaned}`"
-
-        return cleaned
-
```
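`_validate_identifier` hangs off a single character-class regex; note that in Python 3 `\w` already matches CJK word characters, so the explicit `\u4e00-\u9fff` range is belt-and-braces. A standalone sketch of what survives sanitization:

```python
import re

def sanitize(identifier: str) -> str:
    # Same pattern as _validate_identifier above.
    return re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)

for s in ['日期', 'order-2024.csv', 'user; DROP TABLE x', 'select']:
    print(repr(s), '->', repr(sanitize(s)))
# '日期' -> '日期'
# 'order-2024.csv' -> 'order2024csv'
# 'user; DROP TABLE x' -> 'userDROPTABLEx'
# 'select' -> 'select'  (then backtick-quoted by the reserved-word check)
```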
```diff
-    def _check_table_exists(self, db_name: str, table_name: str) -> bool:
-        """Check whether the table exists"""
-        cache_key = f"{db_name}.{table_name}"
-        if cache_key in self._table_metadata_cache:
-            cached_time, result = self._table_metadata_cache[cache_key]
-            if time.time() - cached_time < self.metadata_cache_ttl:
-                return result
-
-        db_name = self._validate_identifier(db_name)
-        table_name = self._validate_identifier(table_name)
-        sql = """
-        SELECT TABLE_NAME
-        FROM INFORMATION_SCHEMA.TABLES
-        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
-        """
-
-        try:
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    cursor.execute(sql, (db_name, table_name))
-                    result = bool(cursor.fetchone())
-        except Exception as e:
-            logger.error(f"检查数据表是否存在时发生未知错误: {e}", )
-            raise
-
-        # Run the query and cache the result
-        self._table_metadata_cache[cache_key] = (time.time(), result)
-        return result
-
-    def _create_table(
-            self,
-            db_name: str,
-            table_name: str,
-            set_typ: Dict[str, str],
-            primary_keys: Optional[List[str]] = None,
-            date_column: Optional[str] = None,
-            indexes: Optional[List[str]] = None,
-            allow_null: bool = False
-    ):
-        """
-        Create a data table
-
-        :param db_name: database name
-        :param table_name: table name
-        :param set_typ: dict of column names and data types {column: type}
-        :param primary_keys: list of primary key columns
-        :param date_column: date column name; indexed if present
-        :param indexes: list of columns to index
-        """
-        db_name = self._validate_identifier(db_name)
-        table_name = self._validate_identifier(table_name)
-
-        if not set_typ:
-            error_msg = "No columns specified for table creation"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        # Build the column-definition SQL
-        column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
-
-        # Add the remaining column definitions
-        for col_name, col_type in set_typ.items():
-            # Skip the id column; it was already added above
-            if col_name.lower() == 'id':
-                continue
-            safe_col_name = self._validate_identifier(col_name)
-            col_def = f"`{safe_col_name}` {col_type}"
-
-            # Add a NOT NULL constraint depending on allow_null
-            if not allow_null and not col_type.lower().startswith('json'):
-                col_def += " NOT NULL"
-
-            column_defs.append(col_def)
-
-        # Add the primary key definition
-        if primary_keys:
-            # Make sure id is part of the primary key
-            if 'id' not in [pk.lower() for pk in primary_keys]:
-                primary_keys = ['id'] + primary_keys
-        else:
-            # Default to id as the primary key when none is given
-            primary_keys = ['id']
-
-        # Build the primary key clause
-        safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
-        primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
-
-        # Assemble the full SQL
-        sql = f"""
-        CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
-            {','.join(column_defs)}
-            {primary_key_sql}
-        ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
-        """
-
-        try:
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    cursor.execute(sql)
-                logger.info(f"{db_name}.{table_name}: 数据表已创建")
-
-                # Add secondary indexes
-                index_statements = []
-
-                # Index on the date column
-                if date_column and date_column in set_typ:
-                    safe_date_col = self._validate_identifier(date_column)
-                    index_statements.append(
-                        f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
-                    )
-
-                # Other indexes
-                if indexes:
-                    for idx_col in indexes:
-                        if idx_col in set_typ:
-                            safe_idx_col = self._validate_identifier(idx_col)
-                            index_statements.append(
-                                f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
-                            )
-
-                # Execute all index-creation statements
-                if index_statements:
-                    with conn.cursor() as cursor:
-                        for stmt in index_statements:
-                            cursor.execute(stmt)
-                            logger.debug(f"Executed index statement: {stmt}", )
-
-                conn.commit()
-                logger.info(f"{db_name}.{table_name}: 索引已添加")
-
-        except Exception as e:
-            logger.error(f"{db_name}.{table_name}: 建表失败: {str(e)}")
-            conn.rollback()
-            raise
-
-    def _validate_datetime(self, value, date_type=False):
-        """date_type: return a date object instead of a string"""
-        formats = [
-            '%Y-%m-%d %H:%M:%S',
-            '%Y-%m-%d',
-            '%Y/%m/%d %H:%M:%S',
-            '%Y/%m/%d',
-            '%Y%m%d',
-            '%Y-%m-%dT%H:%M:%S',
-            '%Y-%m-%d %H:%M:%S.%f',
-            '%Y/%-m/%-d',  # 2023/1/8
-            '%Y-%m-%-d',  # 2023-01-8
-            '%Y-%-m-%-d'  # 2023-1-8
-        ]
-        for fmt in formats:
-            try:
-                if date_type:
-                    return pd.to_datetime(datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d'))
-                else:
-                    return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
-            except ValueError:
-                continue
-        raise ValueError(f"无效的日期格式2: {value}")
-
```
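A caveat on the format list above: CPython's `strptime` has no `%-m`/`%-d` directives (those are strftime extensions on glibc), so the last three patterns always raise `ValueError` and are skipped by the `continue`. They are also unnecessary, because `strptime` already accepts unpadded fields with the plain directives:

```python
import datetime

# Unpadded day/month parse fine with the plain directives:
print(datetime.datetime.strptime('2023-1-8', '%Y-%m-%d'))   # 2023-01-08 00:00:00

# '%-m' is not a strptime directive and raises ValueError:
try:
    datetime.datetime.strptime('2023/1/8', '%Y/%-m/%-d')
except ValueError as e:
    print(e)  # "'-' is a bad directive in format '%Y/%-m/%-d'"
```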
```diff
-    def _validate_value(self, value: Any, column_type: str) -> Any:
-        """
-        Validate and clean a data value, converting it to suit the column type
-
-        :param value: value to validate
-        :param column_type: the column's data type
-        :return: cleaned value
-        :raises ValueError: if conversion fails
-        """
-        if value is None:
-            return None
-
-        try:
-            column_type_lower = column_type.lower()
-
-            if 'int' in column_type_lower:
-                if isinstance(value, (str, bytes)) and not value.strip().isdigit():
-                    raise ValueError("非数字字符串无法转换为整数")
-                return int(value)
-            elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
-                return float(value) if value is not None else None
-            elif '日期' in column_type_lower or 'time' in column_type_lower:
-                if isinstance(value, (datetime.datetime, pd.Timestamp)):
-                    return value.strftime('%Y-%m-%d %H:%M:%S')
-                elif isinstance(value, str):
-                    try:
-                        return self._validate_datetime(value)  # use the dedicated date validator
-                    except ValueError as e:
-                        raise ValueError(f"无效日期格式: {value} - {str(e)}")
-                return str(value)
-            elif 'char' in column_type_lower or 'text' in column_type_lower:
-                # Guard against SQL injection
-                if isinstance(value, str):
-                    return value.replace('\\', '\\\\').replace("'", "\\'")
-                return str(value)
-            elif 'json' in column_type_lower:
-                import json
-                return json.dumps(value) if value is not None else None
-            else:
-                return value
-        except (ValueError, TypeError) as e:
-            error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-    def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
-        """Get the table's column names and data types"""
-        db_name = self._validate_identifier(db_name)
-        table_name = self._validate_identifier(table_name)
-        sql = """
-        SELECT COLUMN_NAME, DATA_TYPE
-        FROM INFORMATION_SCHEMA.COLUMNS
-        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
-        ORDER BY ORDINAL_POSITION
-        """
-
-        try:
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    cursor.execute(sql, (db_name, table_name))
-                    set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
-                    logger.debug(f"{db_name}.{table_name}: 获取表的列信息: {set_typ}")
-                    return set_typ
-        except Exception as e:
-            logger.error(f"无法获取表列信息: {str(e)}")
-            raise
-
-    def _upload_to_table(
-            self,
-            db_name: str,
-            table_name: str,
-            data: List[Dict],
-            set_typ: Dict[str, str],
-            primary_keys: Optional[List[str]],
-            check_duplicate: bool,
-            duplicate_columns: Optional[List[str]],
-            allow_null: bool,
-            auto_create: bool,
-            date_column: Optional[str],
-            indexes: Optional[List[str]],
-            batch_id: Optional[str] = None
-    ):
-        """Perform the actual table upload"""
-        # Check whether the table exists
-        if not self._check_table_exists(db_name, table_name):
-            if auto_create:
-                self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
-                                   allow_null=allow_null)
-            else:
-                error_msg = f"数据表不存在: '{db_name}.{table_name}'"
-                logger.error(error_msg)
-                raise ValueError(error_msg)
-
-        # Fetch and validate the table schema
-        table_columns = self._get_table_columns(db_name, table_name)
-        if not table_columns:
-            error_msg = f"获取列失败 '{db_name}.{table_name}'"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        # Verify that the data columns match the table columns
-        for col in set_typ:
-            if col not in table_columns:
-                error_msg = f"列不存在: '{col}' -> '{db_name}.{table_name}'"
-                logger.error(error_msg)
-                raise ValueError(error_msg)
-
-        # Insert the data
-        self._insert_data(
-            db_name, table_name, data, set_typ,
-            check_duplicate, duplicate_columns
-        )
-
-    def _infer_data_type(self, value: Any) -> str:
-        """
-        Infer an appropriate data type from a value
-
-        :param value: value to inspect
-        :return: MySQL data type string
-        """
-        if value is None:
-            return 'VARCHAR(255)'  # default string type
-
-        if isinstance(value, bool):
-            return 'TINYINT(1)'
-        elif isinstance(value, int):
-            # if -128 <= value <= 127:
-            #     return 'TINYINT'
-            # elif -32768 <= value <= 32767:
-            #     return 'SMALLINT'
-            # elif -8388608 <= value <= 8388607:
-            #     return 'MEDIUMINT'
-            if -2147483648 <= value <= 2147483647:
-                return 'INT'
-            else:
-                return 'BIGINT'
-        elif isinstance(value, float):
-            return 'DECIMAL(10,2)'
-        elif isinstance(value, (datetime.datetime, pd.Timestamp)):
-            return 'DATETIME'
-        elif isinstance(value, datetime.date):
-            return 'DATE'
-        elif isinstance(value, (list, dict)):
-            return 'JSON'
-        elif isinstance(value, str):
-            # Try to detect a datetime string
-            try:
-                self._validate_datetime(value)
-                return 'DATETIME'
-            except ValueError:
-                pass
-
-            # Choose a type based on string length
-            length = len(value)
-            if length <= 255:
-                return 'VARCHAR(255)'
-            elif length <= 65535:
-                return 'TEXT'
-            elif length <= 16777215:
-                return 'MEDIUMTEXT'
-            else:
-                return 'LONGTEXT'
-        else:
-            return 'VARCHAR(255)'
-
```
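For eyeballing what schema an untyped column would get, here is a condensed, runnable mirror of the inference above (the datetime-string sniffing and the TEXT size tiers beyond 65,535 characters are omitted for brevity):

```python
import datetime

def infer_mysql_type(value) -> str:
    """Condensed mirror of _infer_data_type; datetime-string sniffing omitted."""
    if value is None:
        return 'VARCHAR(255)'
    if isinstance(value, bool):  # bool first: bool is a subclass of int
        return 'TINYINT(1)'
    if isinstance(value, int):
        return 'INT' if -2147483648 <= value <= 2147483647 else 'BIGINT'
    if isinstance(value, float):
        return 'DECIMAL(10,2)'
    if isinstance(value, datetime.datetime):  # datetime before date: it subclasses date
        return 'DATETIME'
    if isinstance(value, datetime.date):
        return 'DATE'
    if isinstance(value, (list, dict)):
        return 'JSON'
    if isinstance(value, str):
        return 'VARCHAR(255)' if len(value) <= 255 else 'TEXT'
    return 'VARCHAR(255)'

for v in [True, 42, 3_000_000_000, 3.14, [1, 2], 'hello', 'x' * 300]:
    print(infer_mysql_type(v))
# TINYINT(1), INT, BIGINT, DECIMAL(10,2), JSON, VARCHAR(255), TEXT
```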
```diff
-    def _prepare_data(
-            self,
-            data: Union[Dict, List[Dict], pd.DataFrame],
-            set_typ: Dict[str, str],
-            allow_null: bool = False
-    ) -> List[Dict]:
-        """
-        Prepare the data for upload: validate it and convert data types
-
-        :param data: input data
-        :param set_typ: dict of column names and data types {column: type}
-        :param allow_null: whether null values are allowed
-        :return: rows ready for upload, plus the matching data types
-        :raises ValueError: if validation fails
-        """
-        # Normalize the input into a list of dicts
-        if isinstance(data, pd.DataFrame):
-            try:
-                # Lower-case the column names
-                data.columns = [col.lower() for col in data.columns]
-                data = data.replace({pd.NA: None}).to_dict('records')
-            except Exception as e:
-                logger.error(f"数据转字典时发生错误: {e}", )
-                raise ValueError(f"数据转字典时发生错误: {e}")
-        elif isinstance(data, dict):
-            data = [{k.lower(): v for k, v in data.items()}]
-        elif isinstance(data, list) and all(isinstance(item, dict) for item in data):
-            # Lower-case the keys of every dict in the list
-            data = [{k.lower(): v for k, v in item.items()} for item in data]
-        else:
-            error_msg = "数据结构必须是字典、列表、字典列表或dataframe"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        # Lower-case the keys of set_typ
-        set_typ = {k.lower(): v for k, v in set_typ.items()}
-
-        # Collect the column names actually present in the data
-        data_columns = set()
-        if data:
-            data_columns = set(data[0].keys())
-
-        # Filter set_typ down to the columns present in the data
-        filtered_set_typ = {}
-        for col in data_columns:
-            if col in set_typ:
-                filtered_set_typ[col] = set_typ[col]
-            else:
-                # Column missing from set_typ: try to infer its type
-                sample_values = [row[col] for row in data if col in row and row[col] is not None][:10]
-                if sample_values:
-                    inferred_type = self._infer_data_type(sample_values[0])
-                    filtered_set_typ[col] = inferred_type
-                    logger.debug(f"自动推断列'{col}'的数据类型为: {inferred_type}")
-                else:
-                    # No sample values; fall back to the default type
-                    filtered_set_typ[col] = 'VARCHAR(255)'
-                    logger.debug(f"为列'{col}'使用默认数据类型: VARCHAR(255)")
-
-        prepared_data = []
-        for row_idx, row in enumerate(data, 1):
-            prepared_row = {}
-            for col_name in filtered_set_typ:
-                # Skip the id column; callers may not supply an id
-                if col_name.lower() == 'id':
-                    continue
-
-                if col_name not in row:
-                    if not allow_null:
-                        error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
-                        logger.error(error_msg)
-                        raise ValueError(error_msg)
-                    prepared_row[col_name] = None
-                else:
-                    try:
-                        prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name])
-                    except ValueError as e:
-                        error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
-                        logger.error(error_msg)
-                        raise ValueError(error_msg)
-            prepared_data.append(prepared_row)
-
-        logger.debug(f"已准备 {len(prepared_data)} 行数据")
-        return prepared_data, filtered_set_typ
-
-    def upload_data(
-            self,
-            db_name: str,
-            table_name: str,
-            data: Union[Dict, List[Dict], pd.DataFrame],
-            set_typ: Dict[str, str],
-            primary_keys: Optional[List[str]] = None,
-            check_duplicate: bool = False,
-            duplicate_columns: Optional[List[str]] = None,
-            allow_null: bool = False,
-            partition_by: Optional[str] = None,
-            partition_date_column: str = '日期',
-            auto_create: bool = True,
-            indexes: Optional[List[str]] = None
-    ):
-        """
-        Upload data to the database
-        """
-        upload_start = time.time()
-        initial_row_count = len(data) if hasattr(data, '__len__') else 1
-
-        batch_id = f"batch_{int(time.time() * 1000)}"
-        success_flag = False
-
-        logger.info("开始上传数据", {
-            'batch_id': batch_id,
-            'database': db_name,
-            'table': table_name,
-            'partition_by': partition_by,
-            'check_duplicate': check_duplicate,
-            'row_count': len(data) if hasattr(data, '__len__') else 1,
-            'auto_create': auto_create
-        })
-
-        try:
-            # Validate parameters
-            if not set_typ:
-                error_msg = "列的数据类型缺失"
-                logger.error(error_msg)
-                raise ValueError(error_msg)
-
-            if partition_by and partition_by not in ['year', 'month']:
-                error_msg = "分表方式必须是 'year' 或 'month'"
-                logger.error(error_msg)
-                raise ValueError(error_msg)
-
-            # Prepare the data
-            prepared_data, set_typ = self._prepare_data(data, set_typ, allow_null)
-
-            # Check whether the database exists
-            if not self._check_database_exists(db_name):
-                if auto_create:
-                    self._create_database(db_name)
-                else:
-                    error_msg = f"数据库不存在: '{db_name}'"
-                    logger.error(error_msg)
-                    raise ValueError(error_msg)
-
-            # Handle table partitioning
-            if partition_by:
-                partitioned_data = {}
-                for row in prepared_data:
-                    try:
-                        if partition_date_column not in row:
-                            error_msg = f"异常缺失列 '{partition_date_column}'"
-                            logger.error(error_msg)
-                            continue  # skip this row
-
-                        part_table = self._get_partition_table_name(
-                            table_name,
-                            str(row[partition_date_column]),
-                            partition_by
-                        )
-                        if part_table not in partitioned_data:
-                            partitioned_data[part_table] = []
-                        partitioned_data[part_table].append(row)
-                    except Exception as e:
-                        logger.error("分表处理失败", {
-                            'row_data': row,
-                            'error': str(e)
-                        })
-                        continue  # skip this row
-
-                # Upload each partition table
-                for part_table, part_data in partitioned_data.items():
-                    try:
-                        self._upload_to_table(
-                            db_name, part_table, part_data, set_typ,
-                            primary_keys, check_duplicate, duplicate_columns,
-                            allow_null, auto_create, partition_date_column,
-                            indexes, batch_id
-                        )
-                    except Exception as e:
-                        logger.error("分表上传失败", {
-                            'partition_table': part_table,
-                            'error': str(e)
-                        })
-                        continue  # skip this partition and continue with the rest
-            else:
-                # No partitioning: upload directly
-                self._upload_to_table(
-                    db_name, table_name, prepared_data, set_typ,
-                    primary_keys, check_duplicate, duplicate_columns,
-                    allow_null, auto_create, partition_date_column,
-                    indexes, batch_id
-                )
-
-            success_flag = True
-
-        except Exception as e:
-            logger.error("上传过程中发生全局错误", {
-                'error': str(e),
-                'error_type': type(e).__name__
-            })
-        finally:
-            elapsed = time.time() - upload_start
-            logger.info("上传处理完成", {
-                'batch_id': batch_id,
-                'success': success_flag,
-                'time_elapsed': elapsed,
-                'initial_row_count': initial_row_count
-            })
-
-    def _insert_data(
-            self,
-            db_name: str,
-            table_name: str,
-            data: List[Dict],
-            set_typ: Dict[str, str],
-            check_duplicate: bool = False,
-            duplicate_columns: Optional[List[str]] = None,
-            batch_size: int = 1000,
-            batch_id: Optional[str] = None
-    ):
-        """
-        Insert rows into the table
-
-        Parameters:
-            db_name: database name
-            table_name: table name
-            data: list of rows to insert
-            set_typ: dict of column names and data types {column: type}
-            check_duplicate: whether to check for duplicates
-            duplicate_columns: columns used for the duplicate check (all columns when empty)
-            batch_size: batch insert size
-            batch_id: batch ID for log tracing
-        """
-        if not data:
-            return
-
-        # Collect all column names (excluding the id column)
-        all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
-        safe_columns = [self._validate_identifier(col) for col in all_columns]
-        placeholders = ','.join(['%s'] * len(safe_columns))
-
-        # Build the base SQL statement
-        if check_duplicate:
-            if not duplicate_columns:
-                duplicate_columns = all_columns
-            else:
-                duplicate_columns = [col for col in duplicate_columns if col != 'id']
-
-            conditions = []
-            for col in duplicate_columns:
-                col_type = set_typ.get(col, '').lower()
-
-                # For DECIMAL columns, use ROUND to keep the precision consistent
-                if col_type.startswith('decimal'):
-                    # Extract the scale, e.g. 2 from DECIMAL(10,2)
-                    scale_match = re.search(r'decimal\(\d+,(\d+)\)', col_type)
-                    scale = int(scale_match.group(1)) if scale_match else 2
-                    conditions.append(f"ROUND(`{self._validate_identifier(col)}`, {scale}) = ROUND(%s, {scale})")
-                else:
-                    conditions.append(f"`{self._validate_identifier(col)}` = %s")
-
-            where_clause = " AND ".join(conditions)
-
-            sql = f"""
-            INSERT INTO `{db_name}`.`{table_name}`
-            (`{'`,`'.join(safe_columns)}`)
-            SELECT {placeholders}
-            FROM DUAL
-            WHERE NOT EXISTS (
-                SELECT 1 FROM `{db_name}`.`{table_name}`
-                WHERE {where_clause}
-            )
-            """
-        else:
-            sql = f"""
-            INSERT INTO `{db_name}`.`{table_name}`
-            (`{'`,`'.join(safe_columns)}`)
-            VALUES ({placeholders})
-            """
-
-        total_inserted = 0
-        total_skipped = 0
-        total_failed = 0  # failure counter
-
-        # Insert the data in batches
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                for i in range(0, len(data), batch_size):
-                    batch_start = time.time()
-                    batch = data[i:i + batch_size]
-                    successful_rows = 0  # successes in the current batch
-
-                    for row in batch:
-                        try:
-                            # Prepare the parameters
-                            row_values = [row.get(col) for col in all_columns]
-                            # When checking duplicates, append the duplicate-check values
-                            if check_duplicate:
-                                row_values += [row.get(col) for col in duplicate_columns]
-
-                            cursor.execute(sql, row_values)
-                            successful_rows += 1
-                            conn.commit()  # commit after each successful insert
-
-                        except Exception as e:
-                            conn.rollback()  # roll back the current row's transaction
-                            total_failed += 1
-
-                            # Log details of the failed row
-                            error_details = {
-                                'batch_id': batch_id,
-                                'database': db_name,
-                                'table': table_name,
-                                'error_type': type(e).__name__,
-                                'error_message': str(e),
-                                'column_types': set_typ,
-                                'duplicate_check': check_duplicate,
-                                'duplicate_columns': duplicate_columns
-                            }
-                            logger.error(f"单行插入失败: {error_details}")
-                            continue  # skip this row and carry on
-
-                    # Update the statistics
-                    if check_duplicate:
-                        cursor.execute("SELECT ROW_COUNT()")
-                        affected_rows = cursor.rowcount
-                        total_inserted += affected_rows
-                        total_skipped += len(batch) - affected_rows - (len(batch) - successful_rows)
-                    else:
-                        total_inserted += successful_rows
-
-                    batch_elapsed = time.time() - batch_start
-                    batch_info = {
-                        'batch_id': batch_id,
-                        'batch_index': i // batch_size + 1,
-                        'total_batches': (len(data) + batch_size - 1) // batch_size,
-                        'batch_size': len(batch),
-                        'successful_rows': successful_rows,
-                        'failed_rows': len(batch) - successful_rows,
-                        'time_elapsed': batch_elapsed,
-                        'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
-                    }
-                    logger.debug(f"批次处理完成 {batch_info}")
-
-        logger.info("数据插入完成", {
-            'total_rows': len(data),
-            'inserted_rows': total_inserted,
-            'skipped_rows': total_skipped,
-            'failed_rows': total_failed
-        })
-
```
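When `check_duplicate=True`, each row goes through an `INSERT ... SELECT ... FROM DUAL WHERE NOT EXISTS (...)` statement rather than a plain `VALUES` insert, and the parameter list carries the row values twice: once for the insert, once for the NOT EXISTS probe. A sketch of the generated statement, with hypothetical database, table, and column names:

```python
db_name, table_name = 'demo_db', 'demo_table'      # hypothetical names
all_columns = duplicate_columns = ['name', 'age']  # hypothetical columns

placeholders = ','.join(['%s'] * len(all_columns))
where_clause = ' AND '.join(f'`{col}` = %s' for col in duplicate_columns)
sql = f"""
INSERT INTO `{db_name}`.`{table_name}`
(`{'`,`'.join(all_columns)}`)
SELECT {placeholders}
FROM DUAL
WHERE NOT EXISTS (
    SELECT 1 FROM `{db_name}`.`{table_name}`
    WHERE {where_clause}
)
"""
row = {'name': 'Alice', 'age': 30}
params = [row[c] for c in all_columns] + [row[c] for c in duplicate_columns]
print(sql)
print(params)  # ['Alice', 30, 'Alice', 30] -- insert values, then the probe values
```

Committing after every row, as the loop above does, keeps failures isolated to a single row at the cost of throughput.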
```diff
-    def close(self):
-        """Close the connection pool and log final metrics"""
-        close_start = time.time()
-
-        try:
-            if hasattr(self, 'pool') and self.pool is not None:
-                # Close as safely as possible
-                try:
-                    self.pool.close()
-                except Exception as e:
-                    logger.warning("关闭连接池时出错", {
-                        'error': str(e)
-                    })
-
-                self.pool = None
-
-            elapsed = round(time.time() - close_start, 2)
-            logger.info("连接池已关闭", {
-                'close_time_elapsed': elapsed
-            })
-        except Exception as e:
-            elapsed = round(time.time() - close_start, 2)
-            logger.error("关闭连接池失败", {
-                'error': str(e),
-                'close_time_elapsed': elapsed
-            })
-            raise
-
-    def _check_pool_health(self):
-        """Periodically check connection pool health"""
-        try:
-            conn = self.pool.connection()
-            conn.ping(reconnect=True)
-            conn.close()
-            return True
-        except Exception as e:
-            logger.warning("连接池健康检查失败", {
-                'error': str(e)
-            })
-            return False
-
-def retry_on_failure(max_retries=3, delay=1):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            last_exception = None
-            for attempt in range(max_retries):
-                try:
-                    return func(*args, **kwargs)
-                except (pymysql.OperationalError, pymysql.InterfaceError) as e:
-                    last_exception = e
-                    if attempt < max_retries - 1:
-                        time.sleep(delay * (attempt + 1))
-                        continue
-                    raise MySQLUploaderError(f"操作重试{max_retries}次后失败") from e
-                except Exception as e:
-                    raise MySQLUploaderError(f"操作失败: {str(e)}") from e
-            raise last_exception if last_exception else MySQLUploaderError("未知错误")
-
-        return wrapper
-
-    return decorator
-
-
```
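Review note: `retry_on_failure` raises `MySQLUploaderError`, which does not appear to be defined anywhere in the removed code (nor in the parts of mysql.py shown in this diff), so exhausting the retries would itself die with a `NameError`. A usage sketch, with a hypothetical stub for the missing exception and assuming the decorator above is in scope:

```python
class MySQLUploaderError(Exception):
    """Hypothetical stub; the removed module raises this but never defines it."""

@retry_on_failure(max_retries=3, delay=1)  # the decorator defined above
def count_rows(conn, table: str) -> int:
    # Retried up to 3 times on OperationalError/InterfaceError,
    # sleeping delay * (attempt + 1) seconds between attempts.
    with conn.cursor() as cursor:
        cursor.execute(f"SELECT COUNT(*) AS cnt FROM `{table}`")
        return cursor.fetchone()['cnt']
```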
```diff
-class MySQLDeduplicator:
-    """
-    MySQL data deduplication
-
-    Features:
-    1. Automatically detect and delete duplicate rows in a MySQL database
-    2. Scan the whole instance or target specific tables
-    3. Multi-thread / multi-process safe
-    4. Thorough error handling and logging
-
-    Usage example:
-    deduplicator = MySQLDeduplicator(
-        username='root',
-        password='password',
-        host='localhost',
-        port=3306
-    )
-
-    # Deduplicate every database
-    deduplicator.deduplicate_all()
-
-    # Deduplicate one database (multi-threaded)
-    deduplicator.deduplicate_database('my_db', parallel=True)
-
-    # Deduplicate one table (using specific columns)
-    deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
-
-    # Close the connection
-    deduplicator.close()
-    """
-
-    def __init__(
-            self,
-            username: str,
-            password: str,
-            host: str = 'localhost',
-            port: int = 3306,
-            charset: str = 'utf8mb4',
-            max_workers: int = 1,
-            batch_size: int = 1000,
-            skip_system_dbs: bool = True,
-            max_retries: int = 3,
-            retry_interval: int = 5,
-            pool_size: int = 5
-    ):
-        """
-        Initialize the deduplicator
-
-        :param username: database username
-        :param password: database password
-        :param host: database host, defaults to localhost
-        :param port: database port, defaults to 3306
-        :param charset: character set, defaults to utf8mb4
-        :param max_workers: maximum worker threads, defaults to 1 (single-threaded)
-        :param batch_size: batch size, defaults to 1000
-        :param skip_system_dbs: whether to skip system databases, defaults to True
-        :param max_retries: maximum retry count
-        :param retry_interval: retry interval in seconds
-        :param pool_size: connection pool size
-        """
-        # Initialize the connection pool
-        self.pool = PooledDB(
-            creator=pymysql,
-            host=host,
-            port=port,
-            user=username,
-            password=password,
-            charset=charset,
-            maxconnections=pool_size,
-            cursorclass=pymysql.cursors.DictCursor
-        )
-
-        # Configuration
-        self.max_workers = max(1, min(max_workers, 20))  # cap the thread count
-        self.batch_size = batch_size
-        self.skip_system_dbs = skip_system_dbs
-        self.max_retries = max_retries
-        self.retry_interval = retry_interval
-
-        # Thread-safety controls
-        self._lock = threading.Lock()
-        self._processing_tables = set()  # tables currently being processed
-
-        # System databases
-        self.SYSTEM_DATABASES = {
-            'information_schema', 'mysql',
-            'performance_schema', 'sys'
-        }
-
-    def _get_connection(self):
-        """Get a connection from the pool"""
-        try:
-            conn = self.pool.connection()
-            logger.debug("成功获取数据库连接")
-            return conn
-        except Exception as e:
-            logger.error(f"获取数据库连接失败: {str(e)}")
-            raise ConnectionError(f"连接数据库失败: {str(e)}")
-
-    @staticmethod
-    def _retry_on_failure(func):
-        """Retry decorator"""
-
-        @wraps(func)
-        def wrapper(self, *args, **kwargs):
-            last_exception = None
-            for attempt in range(self.max_retries + 1):
-                try:
-                    return func(self, *args, **kwargs)
-                except (pymysql.OperationalError, pymysql.InterfaceError) as e:
-                    last_exception = e
-                    if attempt < self.max_retries:
-                        wait_time = self.retry_interval * (attempt + 1)
-                        logger.warning(
-                            f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
-                            {'error': str(e), 'wait_time': wait_time})
-                        time.sleep(wait_time)
-                        continue
-                except Exception as e:
-                    last_exception = e
-                    logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
-                    break
-
-            if last_exception:
-                raise last_exception
-            raise Exception("未知错误")
-
-        return wrapper
-
-    @_retry_on_failure
-    def _get_databases(self) -> List[str]:
-        """Get all non-system databases"""
-        sql = "SHOW DATABASES"
-
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(sql)
-                all_dbs = [row['Database'] for row in cursor.fetchall()]
-
-                if self.skip_system_dbs:
-                    return [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES]
-                return all_dbs
-
-    @_retry_on_failure
-    def _get_tables(self, database: str) -> List[str]:
-        """Get all tables in the given database"""
-        sql = "SHOW TABLES"
-
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(f"USE `{database}`")
-                cursor.execute(sql)
-                return [row[f'Tables_in_{database}'] for row in cursor.fetchall()]
-
-    @_retry_on_failure
-    def _get_table_columns(self, database: str, table: str) -> List[str]:
-        """Get the table's column names (excluding the id column)"""
-        sql = """
-        SELECT COLUMN_NAME
-        FROM INFORMATION_SCHEMA.COLUMNS
-        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
-        ORDER BY ORDINAL_POSITION
-        """
-
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(sql, (database, table))
-                return [row['COLUMN_NAME'] for row in cursor.fetchall()
-                        if row['COLUMN_NAME'].lower() != 'id']
-
-    def _acquire_table_lock(self, database: str, table: str) -> bool:
-        """Acquire the per-table lock so the same table is never processed concurrently"""
-        key = f"{database}.{table}"
-
-        with self._lock:
-            if key in self._processing_tables:
-                logger.debug(f"表 {key} 正在被其他线程处理,跳过")
-                return False
-            self._processing_tables.add(key)
-            return True
-
-    def _release_table_lock(self, database: str, table: str):
-        """Release the per-table lock"""
-        key = f"{database}.{table}"
-
-        with self._lock:
-            if key in self._processing_tables:
-                self._processing_tables.remove(key)
-
-    def _deduplicate_table(
-            self,
-            database: str,
-            table: str,
-            columns: Optional[List[str]] = None,
-            dry_run: bool = False
-    ) -> Tuple[int, int]:
-        """
-        Deduplicate a single table
-
-        :param database: database name
-        :param table: table name
-        :param columns: columns used for deduplication (all columns when None)
-        :param dry_run: dry run (count only, no deletion)
-        :return: (duplicate groups, deleted rows)
-        """
-        if not self._acquire_table_lock(database, table):
-            return (0, 0)
-
-        try:
-            logger.info(f"开始处理表: {database}.{table}")
-
-            # Fetch the real column names
-            all_columns = self._get_table_columns(database, table)
-            if not all_columns:
-                logger.warning(f"表 {database}.{table} 没有有效列(可能只有id列),跳过")
-                return (0, 0)
-
-            # Use the given columns or all columns
-            use_columns = columns or all_columns
-            invalid_columns = set(use_columns) - set(all_columns)
-
-            if invalid_columns:
-                logger.warning(
-                    f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
-                    {'invalid_columns': invalid_columns}
-                )
-                use_columns = [col for col in use_columns if col in all_columns]
-
-            if not use_columns:
-                logger.error(f"表 {database}.{table} 没有有效的去重列")
-                return (0, 0)
-
-            # Build the deduplication SQL
-            column_list = ', '.join([f'`{col}`' for col in use_columns])
-            temp_table = f"temp_{table}_{int(time.time())}"
-
-            # Use a temp table to avoid locking the source table
-            create_temp_sql = f"""
-            CREATE TABLE `{database}`.`{temp_table}` AS
-            SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
-            FROM `{database}`.`{table}`
-            GROUP BY {column_list}
-            HAVING COUNT(*) > 1
-            """
-
-            delete_dup_sql = f"""
-            DELETE FROM `{database}`.`{table}`
-            WHERE `id` NOT IN (
-                SELECT `min_id` FROM `{database}`.`{temp_table}`
-            ) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
-            """
-
-            drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
-
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    # Create the temp table that tallies the duplicate groups
-                    cursor.execute(create_temp_sql)
-                    cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
-                    dup_count = cursor.fetchone()['cnt']
-
-                    if dup_count == 0:
-                        logger.info(f"表 {database}.{table} 没有重复数据")
-                        cursor.execute(drop_temp_sql)
-                        conn.commit()
-                        return (0, 0)
-
-                    logger.info(
-                        f"表 {database}.{table} 发现 {dup_count} 组重复数据",
-                        {'columns': use_columns}
-                    )
-
-                    if not dry_run:
-                        # Perform the actual deletion
-                        cursor.execute(delete_dup_sql)
-                        affected_rows = cursor.rowcount
-                        conn.commit()
-                        logger.info(
-                            f"表 {database}.{table} 已删除 {affected_rows} 行重复数据",
-                            {'columns': use_columns}
-                        )
-                    else:
-                        affected_rows = 0
-                        logger.info(
-                            f"[模拟运行] 表 {database}.{table} 将删除 {dup_count} 组重复数据",
-                            {'columns': use_columns}
-                        )
-
-                    # Drop the temp table
-                    cursor.execute(drop_temp_sql)
-                    conn.commit()
-
-                    return (dup_count, affected_rows)
-
-        except Exception as e:
-            logger.error(
-                f"处理表 {database}.{table} 时出错: {str(e)}",
-                {'error_type': type(e).__name__}
-            )
-            return (0, 0)
-        finally:
-            self._release_table_lock(database, table)
-
```
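The temp-table strategy above materializes one `min_id` per duplicate group and then deletes everything else. A sketch of the generated statements with hypothetical names; note that, as written, the `NOT IN (SELECT min_id ...)` predicate also matches rows that belong to no duplicate group at all, so unique rows with any non-null dedup column appear to be deleted too, which is worth verifying before relying on this logic:

```python
import time

database, table = 'demo_db', 'orders'  # hypothetical names
use_columns = ['name', 'date']         # hypothetical dedup columns
column_list = ', '.join(f'`{col}`' for col in use_columns)
temp_table = f'temp_{table}_{int(time.time())}'

create_temp_sql = f"""
CREATE TABLE `{database}`.`{temp_table}` AS
SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
FROM `{database}`.`{table}`
GROUP BY {column_list}
HAVING COUNT(*) > 1
"""

delete_dup_sql = f"""
DELETE FROM `{database}`.`{table}`
WHERE `id` NOT IN (
    SELECT `min_id` FROM `{database}`.`{temp_table}`
) AND ({' OR '.join(f'`{col}` IS NOT NULL' for col in use_columns)})
"""

print(create_temp_sql)
print(delete_dup_sql)
print(f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`")
```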
```diff
-    def deduplicate_table(
-            self,
-            database: str,
-            table: str,
-            columns: Optional[List[str]] = None,
-            dry_run: bool = False
-    ) -> Tuple[int, int]:
-        """
-        Deduplicate the given table
-
-        :param database: database name
-        :param table: table name
-        :param columns: columns used for deduplication (all columns when None)
-        :param dry_run: dry run (count only, no deletion)
-        :return: (duplicate groups, deleted rows)
-        """
-        try:
-            # Check whether the table exists
-            if not self._check_table_exists(database, table):
-                logger.warning(f"表 {database}.{table} 不存在,跳过")
-                return (0, 0)
-
-            return self._deduplicate_table(database, table, columns, dry_run)
-        except Exception as e:
-            logger.error(
-                f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
-                {'error_type': type(e).__name__}
-            )
-            return (0, 0)
-
-    def deduplicate_database(
-            self,
-            database: str,
-            tables: Optional[List[str]] = None,
-            columns_map: Optional[Dict[str, List[str]]] = None,
-            dry_run: bool = False,
-            parallel: bool = False
-    ) -> Dict[str, Tuple[int, int]]:
-        """
-        Deduplicate all tables in the given database
-
-        :param database: database name
-        :param tables: tables to process (all tables when None)
-        :param columns_map: dedup columns per table {table: [columns]}
-        :param dry_run: dry run
-        :param parallel: process tables in parallel
-        :return: dict {table: (duplicate groups, deleted rows)}
-        """
-        results = {}
-
-        try:
-            # Check whether the database exists
-            if not self._check_database_exists(database):
-                logger.warning(f"数据库 {database} 不存在,跳过")
-                return results
-
-            # Collect the tables to process
-            target_tables = tables or self._get_tables(database)
-            if not target_tables:
-                logger.info(f"数据库 {database} 中没有表,跳过")
-                return results
-
-            logger.info(
-                f"开始处理数据库 {database} 中的 {len(target_tables)} 张表",
-                {'tables': target_tables}
-            )
-
-            if parallel and self.max_workers > 1:
-                # Parallel processing
-                with concurrent.futures.ThreadPoolExecutor(
-                        max_workers=self.max_workers
-                ) as executor:
-                    futures = {}
-                    for table in target_tables:
-                        columns = columns_map.get(table) if columns_map else None
-                        futures[executor.submit(
-                            self.deduplicate_table,
-                            database, table, columns, dry_run
-                        )] = table
-
-                    for future in concurrent.futures.as_completed(futures):
-                        table = futures[future]
-                        try:
-                            dup_count, affected_rows = future.result()
-                            results[table] = (dup_count, affected_rows)
-                        except Exception as e:
-                            logger.error(
-                                f"处理表 {database}.{table} 时出错: {str(e)}",
-                                {'error_type': type(e).__name__}
-                            )
-                            results[table] = (0, 0)
-            else:
-                # Serial processing
-                for table in target_tables:
-                    columns = columns_map.get(table) if columns_map else None
-                    dup_count, affected_rows = self.deduplicate_table(
-                        database, table, columns, dry_run
-                    )
-                    results[table] = (dup_count, affected_rows)
-
-            # Summarize the results
-            total_dup = sum(r[0] for r in results.values())
-            total_del = sum(r[1] for r in results.values())
-
-            logger.info(
-                f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
-                {'results': results}
-            )
-
-            return results
-
-        except Exception as e:
-            logger.error(f"处理数据库 {database} 时发生全局错误: {str(e)}", {'error_type': type(e).__name__})
-            return results
-
-    def deduplicate_all(
-            self,
-            databases: Optional[List[str]] = None,
-            tables_map: Optional[Dict[str, List[str]]] = None,
-            columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
-            dry_run: bool = False,
-            parallel: bool = False
-    ) -> Dict[str, Dict[str, Tuple[int, int]]]:
-        """
-        Deduplicate every database
-
-        :param databases: databases to process (all non-system databases when None)
-        :param tables_map: tables per database {database: [tables]}
-        :param columns_map: dedup columns per table {database: {table: [columns]}}
-        :param dry_run: dry run
-        :param parallel: process in parallel
-        :return: nested dict {database: {table: (duplicate groups, deleted rows)}}
-        """
-        all_results = defaultdict(dict)
-
-        try:
-            # Collect the databases to process
-            target_dbs = databases or self._get_databases()
-            if not target_dbs:
-                logger.warning("没有可处理的数据库")
-                return all_results
-
-            logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
-
-            if parallel and self.max_workers > 1:
-                # Process databases in parallel
-                with concurrent.futures.ThreadPoolExecutor(
-                        max_workers=self.max_workers
-                ) as executor:
-                    futures = {}
-                    for db in target_dbs:
-                        tables = tables_map.get(db) if tables_map else None
-                        db_columns_map = columns_map.get(db) if columns_map else None
-                        futures[executor.submit(
-                            self.deduplicate_database,
-                            db, tables, db_columns_map, dry_run, False
-                        )] = db
-
-                    for future in concurrent.futures.as_completed(futures):
-                        db = futures[future]
-                        try:
-                            db_results = future.result()
-                            all_results[db] = db_results
-                        except Exception as e:
-                            logger.error(f"处理数据库 {db} 时出错: {str(e)}", {'error_type': type(e).__name__})
-                            all_results[db] = {}
-            else:
-                # Process databases serially
-                for db in target_dbs:
-                    tables = tables_map.get(db) if tables_map else None
-                    db_columns_map = columns_map.get(db) if columns_map else None
-                    db_results = self.deduplicate_database(
-                        db, tables, db_columns_map, dry_run, parallel
-                    )
-                    all_results[db] = db_results
-
-            # Summarize the overall results
-            total_dup = sum(
-                r[0] for db in all_results.values()
-                for r in db.values()
-            )
-            total_del = sum(
-                r[1] for db in all_results.values()
-                for r in db.values()
-            )
-
-            logger.info(
-                f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据,删除 {total_del} 行",
-                {'total_results': all_results}
-            )
-
-            return all_results
-
-        except Exception as e:
-            logger.error(f"全局处理时发生错误: {str(e)}", {'error_type': type(e).__name__})
-            return all_results
-
-    @_retry_on_failure
-    def _check_database_exists(self, database: str) -> bool:
-        """Check whether the database exists"""
-        sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(sql, (database,))
-                return bool(cursor.fetchone())
-
-    @_retry_on_failure
-    def _check_table_exists(self, database: str, table: str) -> bool:
-        """Check whether the table exists"""
-        sql = """
-        SELECT TABLE_NAME
-        FROM INFORMATION_SCHEMA.TABLES
-        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
-        """
-
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute(sql, (database, table))
-                return bool(cursor.fetchone())
-
-    def close(self):
-        """Close the connection pool"""
-        try:
-            if hasattr(self, 'pool') and self.pool:
-                self.pool.close()
-                logger.info("数据库连接池已关闭")
-        except Exception as e:
-            logger.error(f"关闭连接池时出错: {str(e)}", {'error_type': type(e).__name__})
-        finally:
-            self.pool = None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-
-def main():
-    uploader = MySQLUploader(
-        username='root',
-        password='pw',
-        host='localhost',
-        port=3306,
-    )
-
-    # Define the columns and data types
-    set_typ = {
-        'name': 'VARCHAR(255)',
-        'age': 'INT',
-        'salary': 'DECIMAL(10,2)',
-        '日期': 'DATE',
-        'shop': None,
-    }
-
-    # Prepare the data
-    data = [
-        {'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
-        {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': 100},
-        {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 30, 'salary': 0.0},
-        {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75}
-    ]
-
-    # Upload the data
-    uploader.upload_data(
-        db_name='测试库',
-        table_name='测试表',
-        data=data,
-        set_typ=set_typ,  # columns and data types
-        primary_keys=[],  # create a unique primary key
-        check_duplicate=False,  # check for duplicate rows
-        duplicate_columns=[],  # composite key used for deduplication
-        allow_null=False,  # allow inserting null values
-        partition_by='year',  # partition tables by year
-        partition_date_column='日期',  # date column used for partitioning, defaults to '日期'
-        auto_create=True,  # create the table if missing; do not change this default
-        indexes=[],  # columns to index
-    )
-
-    uploader.close()
-
-
-def main2():
-    deduplicator = MySQLDeduplicator(
-        username='root',
-        password='pw',
-        host='localhost',
-        port=3306
-    )
-
-    # Deduplicate every database (single-threaded)
-    deduplicator.deduplicate_all()
-
-    # # Deduplicate one database (multi-threaded)
-    # deduplicator.deduplicate_database('my_db', parallel=True)
-
-    # # Deduplicate one table (using specific columns)
-    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
-
-    # Close the connection
-    deduplicator.close()
-
 if __name__ == '__main__':
     pass
```