mdbq 4.1.14__py3-none-any.whl → 4.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mdbq might be problematic. Click here for more details.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +740 -2765
- mdbq/other/download_sku_picture.py +15 -36
- {mdbq-4.1.14.dist-info → mdbq-4.2.1.dist-info}/METADATA +2 -2
- {mdbq-4.1.14.dist-info → mdbq-4.2.1.dist-info}/RECORD +7 -7
- {mdbq-4.1.14.dist-info → mdbq-4.2.1.dist-info}/WHEEL +1 -1
- {mdbq-4.1.14.dist-info → mdbq-4.2.1.dist-info}/top_level.txt +0 -0
mdbq/mysql/uploader.py
CHANGED
|
@@ -1,659 +1,278 @@
|
|
|
1
1
|
# -*- coding:utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
MySQL数据上传器 - 重构版本
|
|
4
|
+
提供高可用、易维护的MySQL数据上传功能
|
|
5
|
+
"""
|
|
6
|
+
|
|
2
7
|
import datetime
|
|
3
|
-
import re
|
|
4
8
|
import time
|
|
9
|
+
import json
|
|
10
|
+
import re
|
|
11
|
+
from typing import Union, List, Dict, Optional, Any, Tuple
|
|
5
12
|
from functools import wraps
|
|
6
|
-
import
|
|
13
|
+
from decimal import Decimal, InvalidOperation
|
|
14
|
+
import math
|
|
15
|
+
|
|
7
16
|
import pymysql
|
|
8
17
|
import pandas as pd
|
|
9
|
-
import os
|
|
10
|
-
from mdbq.log import mylogger
|
|
11
|
-
from mdbq.myconf import myconf
|
|
12
|
-
from typing import Union, List, Dict, Optional, Any, Tuple, Set
|
|
13
18
|
from dbutils.pooled_db import PooledDB
|
|
14
|
-
from
|
|
15
|
-
import
|
|
16
|
-
import json
|
|
19
|
+
from mdbq.log import mylogger
|
|
20
|
+
# from mdbq.myconf import myconf
|
|
17
21
|
|
|
18
|
-
|
|
22
|
+
# 配置日志
|
|
19
23
|
logger = mylogger.MyLogger(
|
|
20
24
|
logging_mode='file',
|
|
21
25
|
log_level='info',
|
|
22
26
|
log_format='json',
|
|
23
27
|
max_log_size=50,
|
|
24
28
|
backup_count=5,
|
|
25
|
-
enable_async=False,
|
|
26
|
-
sample_rate=1,
|
|
27
|
-
sensitive_fields=[],
|
|
28
|
-
enable_metrics=False,
|
|
29
|
+
enable_async=False,
|
|
30
|
+
sample_rate=1,
|
|
31
|
+
sensitive_fields=[],
|
|
32
|
+
enable_metrics=False,
|
|
29
33
|
)
|
|
30
34
|
|
|
31
35
|
|
|
32
|
-
|
|
33
|
-
"""
|
|
34
|
-
统计小数点前后位数,支持科学计数法。
|
|
35
|
-
返回:(整数位数, 小数位数)
|
|
36
|
-
"""
|
|
37
|
-
try:
|
|
38
|
-
d = Decimal(str(num_str))
|
|
39
|
-
sign, digits, exponent = d.as_tuple()
|
|
40
|
-
int_part = len(digits) + exponent if exponent < 0 else len(digits)
|
|
41
|
-
dec_part = -exponent if exponent < 0 else 0
|
|
42
|
-
return max(int_part, 0), max(dec_part, 0)
|
|
43
|
-
except (InvalidOperation, ValueError, TypeError):
|
|
44
|
-
return (0, 0)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class StatementCache(dict):
|
|
48
|
-
"""LRU缓存实现,用于SQL语句缓存"""
|
|
49
|
-
def __init__(self, maxsize=100):
|
|
50
|
-
super().__init__()
|
|
51
|
-
self._maxsize = maxsize
|
|
52
|
-
self._order = []
|
|
53
|
-
def __getitem__(self, key):
|
|
54
|
-
value = super().__getitem__(key)
|
|
55
|
-
self._order.remove(key)
|
|
56
|
-
self._order.append(key)
|
|
57
|
-
return value
|
|
58
|
-
def __setitem__(self, key, value):
|
|
59
|
-
if key in self:
|
|
60
|
-
self._order.remove(key)
|
|
61
|
-
elif len(self._order) >= self._maxsize:
|
|
62
|
-
oldest = self._order.pop(0)
|
|
63
|
-
super().__delitem__(oldest)
|
|
64
|
-
super().__setitem__(key, value)
|
|
65
|
-
self._order.append(key)
|
|
66
|
-
def get(self, key, default=None):
|
|
67
|
-
if key in self:
|
|
68
|
-
return self[key]
|
|
69
|
-
return default
|
|
70
|
-
|
|
71
|
-
class MySQLUploader:
|
|
72
|
-
"""
|
|
73
|
-
MySQL数据上传
|
|
36
|
+
class DatabaseConnectionManager:
|
|
37
|
+
"""数据库连接管理器"""
|
|
74
38
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
"""
|
|
78
|
-
def __init__(
|
|
79
|
-
self,
|
|
80
|
-
username: str,
|
|
81
|
-
password: str,
|
|
82
|
-
host: str = 'localhost',
|
|
83
|
-
port: int = 3306,
|
|
84
|
-
charset: str = 'utf8mb4',
|
|
85
|
-
collation: str = 'utf8mb4_0900_ai_ci',
|
|
86
|
-
max_retries: int = 10,
|
|
87
|
-
retry_waiting_time: int = 10,
|
|
88
|
-
pool_size: int = 5,
|
|
89
|
-
mincached: int = 0,
|
|
90
|
-
maxcached: int = 0,
|
|
91
|
-
connect_timeout: int = 10,
|
|
92
|
-
read_timeout: int = 30,
|
|
93
|
-
write_timeout: int = 30,
|
|
94
|
-
ssl: Optional[Dict] = None
|
|
95
|
-
):
|
|
96
|
-
"""
|
|
97
|
-
初始化MySQL上传器
|
|
98
|
-
|
|
99
|
-
:param username: 数据库用户名
|
|
100
|
-
:param password: 数据库密码
|
|
101
|
-
:param host: 数据库主机地址,默认为localhost
|
|
102
|
-
:param port: 数据库端口,默认为3306
|
|
103
|
-
:param charset: 字符集,默认为utf8mb4
|
|
104
|
-
:param collation: 排序规则,默认为utf8mb4_0900_ai_ci,对大小写不敏感,utf8mb4_0900_as_cs/utf8mb4_bin: 对大小写敏感
|
|
105
|
-
:param max_retries: 最大重试次数,默认为10
|
|
106
|
-
:param retry_waiting_time: 重试间隔(秒),默认为10
|
|
107
|
-
:param pool_size: 连接池大小,默认为5
|
|
108
|
-
:param mincached: 空闲连接数量
|
|
109
|
-
:param maxcached: 最大空闲连接数, 0表示不设上限, 由连接池自动管理
|
|
110
|
-
:param connect_timeout: 连接超时(秒),默认为10
|
|
111
|
-
:param read_timeout: 读取超时(秒),默认为30
|
|
112
|
-
:param write_timeout: 写入超时(秒),默认为30
|
|
113
|
-
:param ssl: SSL配置字典,默认为None
|
|
114
|
-
:param auto_creat_missing_cols: 自动添加缺失列,默认为False,建议手动维护表结构
|
|
115
|
-
"""
|
|
116
|
-
self.username = username
|
|
117
|
-
self.password = password
|
|
118
|
-
self.host = host
|
|
119
|
-
self.port = int(port)
|
|
120
|
-
self.charset = charset
|
|
121
|
-
self.collation = collation
|
|
122
|
-
self.max_retries = max(max_retries, 1)
|
|
123
|
-
self.retry_waiting_time = max(retry_waiting_time, 1)
|
|
124
|
-
self.pool_size = max(pool_size, 1)
|
|
125
|
-
self.mincached = mincached
|
|
126
|
-
self.maxcached = maxcached
|
|
127
|
-
self.connect_timeout = connect_timeout
|
|
128
|
-
self.read_timeout = read_timeout
|
|
129
|
-
self.write_timeout = write_timeout
|
|
130
|
-
self.base_excute_col = ['id', '更新时间'] # 排重插入数据时始终排除该列
|
|
131
|
-
self.case_sensitive = False # 是否保持大小写敏感,默认为False(转为小写)
|
|
132
|
-
self.ssl = ssl
|
|
133
|
-
self._prepared_statements = StatementCache(maxsize=100)
|
|
134
|
-
self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
|
|
135
|
-
self._table_metadata_cache = {}
|
|
136
|
-
self.metadata_cache_ttl = 300 # 5分钟缓存时间
|
|
137
|
-
self.pool = self._create_connection_pool() # 创建连接池
|
|
138
|
-
self.auto_creat_missing_cols = False # 自动添加缺失列,正常不要自动添加,建议手动维护表结构
|
|
139
|
-
|
|
140
|
-
def _create_connection_pool(self) -> PooledDB:
|
|
141
|
-
"""
|
|
142
|
-
创建数据库连接池
|
|
143
|
-
|
|
144
|
-
:return: PooledDB连接池实例
|
|
145
|
-
:raises ConnectionError: 当连接池创建失败时抛出
|
|
146
|
-
"""
|
|
147
|
-
if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
|
|
148
|
-
return self.pool
|
|
39
|
+
def __init__(self, config: Dict[str, Any]):
|
|
40
|
+
self.config = config
|
|
149
41
|
self.pool = None
|
|
42
|
+
self._create_pool()
|
|
43
|
+
|
|
44
|
+
def _create_pool(self):
|
|
45
|
+
"""创建连接池"""
|
|
150
46
|
pool_params = {
|
|
151
47
|
'creator': pymysql,
|
|
152
|
-
'host': self.host,
|
|
153
|
-
'port': self.port,
|
|
154
|
-
'user': self.username,
|
|
155
|
-
'password': self.password,
|
|
156
|
-
'charset': self.charset,
|
|
48
|
+
'host': self.config['host'],
|
|
49
|
+
'port': self.config['port'],
|
|
50
|
+
'user': self.config['username'],
|
|
51
|
+
'password': self.config['password'],
|
|
52
|
+
'charset': self.config['charset'],
|
|
157
53
|
'cursorclass': pymysql.cursors.DictCursor,
|
|
158
|
-
'maxconnections': self.pool_size,
|
|
159
|
-
'mincached': self.mincached,
|
|
160
|
-
'maxcached': self.maxcached,
|
|
54
|
+
'maxconnections': self.config['pool_size'],
|
|
55
|
+
'mincached': self.config.get('mincached', 0),
|
|
56
|
+
'maxcached': self.config.get('maxcached', 0),
|
|
161
57
|
'ping': 7,
|
|
162
|
-
'connect_timeout': self.connect_timeout,
|
|
163
|
-
'read_timeout': self.read_timeout,
|
|
164
|
-
'write_timeout': self.write_timeout,
|
|
58
|
+
'connect_timeout': self.config.get('connect_timeout', 10),
|
|
59
|
+
'read_timeout': self.config.get('read_timeout', 30),
|
|
60
|
+
'write_timeout': self.config.get('write_timeout', 30),
|
|
165
61
|
'autocommit': False
|
|
166
62
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
logger.error(error_msg)
|
|
172
|
-
raise ValueError(error_msg)
|
|
173
|
-
pool_params['ssl'] = {
|
|
174
|
-
'ca': self.ssl['ca'],
|
|
175
|
-
'cert': self.ssl['cert'],
|
|
176
|
-
'key': self.ssl['key'],
|
|
177
|
-
'check_hostname': self.ssl.get('check_hostname', False)
|
|
178
|
-
}
|
|
63
|
+
|
|
64
|
+
if self.config.get('ssl'):
|
|
65
|
+
pool_params['ssl'] = self.config['ssl']
|
|
66
|
+
|
|
179
67
|
try:
|
|
180
|
-
pool = PooledDB(**pool_params)
|
|
181
|
-
logger.debug('
|
|
182
|
-
return pool
|
|
68
|
+
self.pool = PooledDB(**pool_params)
|
|
69
|
+
logger.debug('数据库连接池创建成功', {'host': self.config['host']})
|
|
183
70
|
except Exception as e:
|
|
184
|
-
|
|
185
|
-
logger.error('连接池创建失败', {'error': str(e), 'host': self.host, 'port': self.port})
|
|
71
|
+
logger.error('连接池创建失败', {'error': str(e)})
|
|
186
72
|
raise ConnectionError(f'连接池创建失败: {str(e)}')
|
|
73
|
+
|
|
74
|
+
def get_connection(self):
|
|
75
|
+
"""获取数据库连接"""
|
|
76
|
+
if not self.pool:
|
|
77
|
+
self._create_pool()
|
|
78
|
+
return self.pool.connection()
|
|
79
|
+
|
|
80
|
+
def close(self):
|
|
81
|
+
"""关闭连接池"""
|
|
82
|
+
if self.pool:
|
|
83
|
+
self.pool = None
|
|
84
|
+
logger.debug('数据库连接池已关闭')
|
|
85
|
+
|
|
187
86
|
|
|
87
|
+
class DataTypeInferrer:
|
|
88
|
+
"""数据类型推断器"""
|
|
89
|
+
|
|
188
90
|
@staticmethod
|
|
189
|
-
def
|
|
190
|
-
"""
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
:
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
91
|
+
def infer_mysql_type(value: Any) -> str:
|
|
92
|
+
"""推断MySQL数据类型"""
|
|
93
|
+
if value is None or str(value).lower() in ['', 'none', 'nan']:
|
|
94
|
+
return 'VARCHAR(255)'
|
|
95
|
+
|
|
96
|
+
if isinstance(value, bool):
|
|
97
|
+
return 'TINYINT(1)'
|
|
98
|
+
elif isinstance(value, int):
|
|
99
|
+
if -2147483648 <= value <= 2147483647:
|
|
100
|
+
return 'INT'
|
|
101
|
+
else:
|
|
102
|
+
return 'BIGINT'
|
|
103
|
+
elif isinstance(value, float):
|
|
104
|
+
return 'DECIMAL(20,6)'
|
|
105
|
+
elif isinstance(value, (datetime.datetime, pd.Timestamp)):
|
|
106
|
+
return 'DATETIME'
|
|
107
|
+
elif isinstance(value, datetime.date):
|
|
108
|
+
return 'DATE'
|
|
109
|
+
elif isinstance(value, (list, dict)):
|
|
110
|
+
return 'JSON'
|
|
111
|
+
elif isinstance(value, str):
|
|
112
|
+
# 尝试判断是否是日期时间
|
|
113
|
+
if DataValidator.is_datetime_string(value):
|
|
114
|
+
return 'DATETIME'
|
|
115
|
+
|
|
116
|
+
# 根据字符串长度选择类型
|
|
117
|
+
length = len(value)
|
|
118
|
+
if length <= 255:
|
|
119
|
+
return 'VARCHAR(255)'
|
|
120
|
+
elif length <= 65535:
|
|
121
|
+
return 'TEXT'
|
|
122
|
+
else:
|
|
123
|
+
return 'LONGTEXT'
|
|
124
|
+
|
|
125
|
+
return 'VARCHAR(255)'
|
|
126
|
+
|
|
127
|
+
@staticmethod
|
|
128
|
+
def infer_types_from_data(data: List[Dict]) -> Dict[str, str]:
|
|
129
|
+
"""从数据中推断所有列的类型"""
|
|
130
|
+
if not data:
|
|
131
|
+
return {}
|
|
132
|
+
|
|
133
|
+
type_map = {}
|
|
134
|
+
for row in data[:10]: # 只检查前10行
|
|
135
|
+
for col, value in row.items():
|
|
136
|
+
# 跳过系统列
|
|
137
|
+
if col.lower() in ['id', 'create_at', 'update_at']:
|
|
138
|
+
continue
|
|
139
|
+
if col not in type_map and value is not None:
|
|
140
|
+
type_map[col] = DataTypeInferrer.infer_mysql_type(value)
|
|
141
|
+
|
|
142
|
+
# 自动添加系统列类型定义
|
|
143
|
+
type_map['id'] = 'BIGINT'
|
|
144
|
+
type_map['create_at'] = 'TIMESTAMP'
|
|
145
|
+
type_map['update_at'] = 'TIMESTAMP'
|
|
146
|
+
|
|
147
|
+
return type_map
|
|
238
148
|
|
|
239
|
-
@_execute_with_retry
|
|
240
|
-
def _get_connection(self) -> pymysql.connections.Connection:
|
|
241
|
-
"""
|
|
242
|
-
从连接池获取数据库连接
|
|
243
149
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
150
|
+
class DataValidator:
|
|
151
|
+
"""数据验证器"""
|
|
152
|
+
|
|
153
|
+
@staticmethod
|
|
154
|
+
def is_datetime_string(value: str) -> bool:
|
|
155
|
+
"""检查字符串是否为日期时间格式"""
|
|
156
|
+
formats = [
|
|
157
|
+
'%Y-%m-%d %H:%M:%S',
|
|
158
|
+
'%Y-%m-%d',
|
|
159
|
+
'%Y/%m/%d %H:%M:%S',
|
|
160
|
+
'%Y/%m/%d',
|
|
161
|
+
'%Y%m%d',
|
|
162
|
+
'%Y-%m-%dT%H:%M:%S',
|
|
163
|
+
]
|
|
164
|
+
|
|
165
|
+
for fmt in formats:
|
|
253
166
|
try:
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
:param db_name: 要创建的数据库名称
|
|
291
|
-
:raises: 可能抛出数据库相关异常
|
|
292
|
-
"""
|
|
293
|
-
db_name = self._validate_identifier(db_name, is_database=True)
|
|
294
|
-
sql = f'CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}'
|
|
295
|
-
conn = None
|
|
296
|
-
try:
|
|
297
|
-
with self._get_connection() as conn:
|
|
298
|
-
with conn.cursor() as cursor:
|
|
299
|
-
cursor.execute(sql)
|
|
300
|
-
conn.commit()
|
|
301
|
-
logger.debug('数据库已创建', {'库': db_name})
|
|
302
|
-
except Exception as e:
|
|
303
|
-
logger.error('无法创建数据库', {'库': db_name, '错误': str(e)})
|
|
304
|
-
if conn is not None:
|
|
305
|
-
conn.rollback()
|
|
306
|
-
raise
|
|
307
|
-
|
|
308
|
-
def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
|
|
309
|
-
"""
|
|
310
|
-
获取分表名称
|
|
311
|
-
|
|
312
|
-
:param table_name: 基础表名
|
|
313
|
-
:param date_value: 日期值
|
|
314
|
-
:param partition_by: 分表方式 ('year' 或 'month' 或 'none')
|
|
315
|
-
:return: 分表名称
|
|
316
|
-
:raises ValueError: 如果日期格式无效或分表方式无效
|
|
317
|
-
"""
|
|
318
|
-
try:
|
|
319
|
-
date_obj = self._validate_datetime(value=date_value, date_type=True, no_log=False)
|
|
320
|
-
except ValueError:
|
|
321
|
-
logger.error('无效的日期格式', {'表': table_name, '日期值': date_value})
|
|
322
|
-
raise ValueError(f"`{table_name}` 无效的日期格式: `{date_value}`")
|
|
323
|
-
if partition_by == 'year':
|
|
324
|
-
return f"{table_name}_{date_obj.year}"
|
|
325
|
-
elif partition_by == 'month':
|
|
326
|
-
return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
|
|
327
|
-
else:
|
|
328
|
-
logger.error('分表方式无效', {'表': table_name, '分表方式': partition_by})
|
|
329
|
-
raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
|
|
330
|
-
|
|
331
|
-
def _validate_identifier(self, identifier: str, is_database: bool = False) -> str:
|
|
332
|
-
"""
|
|
333
|
-
验证并清理数据库标识符(表名、列名等)
|
|
334
|
-
|
|
335
|
-
:param identifier: 要验证的标识符
|
|
336
|
-
:param is_database: 是否为数据库名,数据库名不能以数字开头
|
|
337
|
-
:return: 清理后的安全标识符
|
|
338
|
-
:raises ValueError: 当标识符无效时抛出
|
|
339
|
-
"""
|
|
340
|
-
if not identifier or not isinstance(identifier, str):
|
|
341
|
-
logger.error('无效的标识符', {'标识符': identifier})
|
|
342
|
-
raise ValueError(f"无效的标识符: `{identifier}`")
|
|
343
|
-
# 始终做特殊字符清理
|
|
344
|
-
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
|
345
|
-
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
|
346
|
-
# 如果清理后为空字符串,使用默认标识符
|
|
347
|
-
if not cleaned:
|
|
348
|
-
logger.warning('标识符清理后为空,使用默认标识符', {'原始标识符': identifier})
|
|
349
|
-
# 使用原始标识符的哈希值作为后缀,保持可追溯性
|
|
350
|
-
import hashlib
|
|
351
|
-
hash_suffix = hashlib.md5(identifier.encode('utf-8')).hexdigest()[:8]
|
|
352
|
-
cleaned = f'unknown_col_{hash_suffix}'
|
|
353
|
-
|
|
354
|
-
# 数据库名不能以数字开头(MySQL要求),但表名和列名可以
|
|
355
|
-
if is_database and cleaned and cleaned[0].isdigit():
|
|
356
|
-
cleaned = f'db_{cleaned}'
|
|
357
|
-
logger.warning('为数字开头的数据库名添加db_前缀', {
|
|
358
|
-
'原始标识符': identifier,
|
|
359
|
-
'清理后': cleaned
|
|
360
|
-
})
|
|
167
|
+
datetime.datetime.strptime(value, fmt)
|
|
168
|
+
return True
|
|
169
|
+
except ValueError:
|
|
170
|
+
continue
|
|
171
|
+
return False
|
|
172
|
+
|
|
173
|
+
@staticmethod
|
|
174
|
+
def validate_and_convert_value(value: Any, mysql_type: str, allow_null: bool = False) -> Any:
|
|
175
|
+
"""验证并转换数据值"""
|
|
176
|
+
mysql_type_lower = mysql_type.lower()
|
|
177
|
+
|
|
178
|
+
# 处理空值
|
|
179
|
+
if value is None or (isinstance(value, str) and value.strip() == ''):
|
|
180
|
+
if allow_null:
|
|
181
|
+
return None
|
|
182
|
+
# 对于日期时间类型,直接返回默认的日期时间值
|
|
183
|
+
if 'datetime' in mysql_type_lower or 'timestamp' in mysql_type_lower:
|
|
184
|
+
return '2000-01-01 00:00:00'
|
|
185
|
+
elif 'date' in mysql_type_lower:
|
|
186
|
+
return '2000-01-01'
|
|
187
|
+
return DataValidator._get_default_value(mysql_type)
|
|
188
|
+
|
|
189
|
+
# 处理pandas的NaN值
|
|
190
|
+
if not isinstance(value, (list, dict)):
|
|
191
|
+
try:
|
|
192
|
+
if pd.isna(value) or (isinstance(value, float) and math.isinf(value)):
|
|
193
|
+
if allow_null:
|
|
194
|
+
return None
|
|
195
|
+
# 对于日期时间类型,直接返回默认的日期时间值
|
|
196
|
+
if 'datetime' in mysql_type_lower or 'timestamp' in mysql_type_lower:
|
|
197
|
+
return '2000-01-01 00:00:00'
|
|
198
|
+
elif 'date' in mysql_type_lower:
|
|
199
|
+
return '2000-01-01'
|
|
200
|
+
return DataValidator._get_default_value(mysql_type)
|
|
201
|
+
except (ValueError, TypeError):
|
|
202
|
+
pass
|
|
361
203
|
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
|
|
375
|
-
"""
|
|
376
|
-
检查表是否存在
|
|
377
|
-
|
|
378
|
-
:param db_name: 数据库名
|
|
379
|
-
:param table_name: 表名
|
|
380
|
-
:return: 存在返回True,否则返回False
|
|
381
|
-
:raises: 可能抛出数据库相关异常
|
|
382
|
-
"""
|
|
383
|
-
cache_key = f"{db_name}.{table_name}"
|
|
384
|
-
if cache_key in self._table_metadata_cache:
|
|
385
|
-
cached_time, result = self._table_metadata_cache[cache_key]
|
|
386
|
-
if time.time() - cached_time < self.metadata_cache_ttl:
|
|
387
|
-
logger.debug('表存在缓存命中', {'库': db_name, '表': table_name, '存在': result})
|
|
388
|
-
return result
|
|
389
|
-
db_name = self._validate_identifier(db_name, is_database=True)
|
|
390
|
-
table_name = self._validate_identifier(table_name)
|
|
391
|
-
sql = """
|
|
392
|
-
SELECT TABLE_NAME
|
|
393
|
-
FROM INFORMATION_SCHEMA.TABLES
|
|
394
|
-
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
|
|
395
|
-
"""
|
|
396
|
-
try:
|
|
397
|
-
with self._get_connection() as conn:
|
|
398
|
-
with conn.cursor() as cursor:
|
|
399
|
-
cursor.execute(sql, (db_name, table_name))
|
|
400
|
-
result = bool(cursor.fetchone())
|
|
401
|
-
except Exception as e:
|
|
402
|
-
logger.error('检查数据表是否存在时发生未知错误', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
403
|
-
raise
|
|
404
|
-
self._table_metadata_cache[cache_key] = (time.time(), result)
|
|
405
|
-
logger.debug('表存在检查', {'库': db_name, '表': table_name, '存在': result})
|
|
406
|
-
return result
|
|
407
|
-
|
|
408
|
-
@_execute_with_retry
|
|
409
|
-
def _create_table(
|
|
410
|
-
self,
|
|
411
|
-
db_name: str,
|
|
412
|
-
table_name: str,
|
|
413
|
-
set_typ: Dict[str, str],
|
|
414
|
-
primary_keys: Optional[List[str]] = None,
|
|
415
|
-
date_column: Optional[str] = None,
|
|
416
|
-
indexes: Optional[List[str]] = None,
|
|
417
|
-
allow_null: bool = False,
|
|
418
|
-
unique_keys: Optional[List[List[str]]] = None
|
|
419
|
-
) -> None:
|
|
420
|
-
"""
|
|
421
|
-
创建数据表,优化索引创建方式
|
|
422
|
-
"""
|
|
423
|
-
db_name = self._validate_identifier(db_name, is_database=True)
|
|
424
|
-
table_name = self._validate_identifier(table_name)
|
|
425
|
-
if not set_typ:
|
|
426
|
-
logger.error('建表时未指定set_typ', {'库': db_name, '表': table_name})
|
|
427
|
-
raise ValueError('set_typ 未指定')
|
|
428
|
-
# set_typ的键清洗
|
|
429
|
-
set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
|
|
430
|
-
|
|
431
|
-
# 处理id列和主键
|
|
432
|
-
column_defs = []
|
|
204
|
+
# JSON类型
|
|
205
|
+
if 'json' in mysql_type_lower:
|
|
206
|
+
if isinstance(value, (dict, list)):
|
|
207
|
+
return json.dumps(value, ensure_ascii=False)
|
|
208
|
+
elif isinstance(value, str):
|
|
209
|
+
try:
|
|
210
|
+
json.loads(value)
|
|
211
|
+
return value
|
|
212
|
+
except (TypeError, ValueError):
|
|
213
|
+
raise ValueError(f"无效的JSON字符串: {value}")
|
|
214
|
+
else:
|
|
215
|
+
return str(value)
|
|
433
216
|
|
|
434
|
-
#
|
|
435
|
-
if
|
|
436
|
-
|
|
217
|
+
# 日期时间类型
|
|
218
|
+
if 'datetime' in mysql_type_lower or 'timestamp' in mysql_type_lower:
|
|
219
|
+
return DataValidator._convert_to_datetime(value)
|
|
220
|
+
elif 'date' in mysql_type_lower:
|
|
221
|
+
return DataValidator._convert_to_date(value)
|
|
437
222
|
|
|
438
|
-
#
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
223
|
+
# 数值类型
|
|
224
|
+
elif 'int' in mysql_type_lower:
|
|
225
|
+
return DataValidator._convert_to_int(value)
|
|
226
|
+
elif any(t in mysql_type_lower for t in ['decimal', 'float', 'double']):
|
|
227
|
+
return DataValidator._convert_to_decimal(value)
|
|
442
228
|
|
|
443
|
-
#
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
col_type = set_typ.get(col, '').lower()
|
|
472
|
-
if 'varchar' in col_type:
|
|
473
|
-
m = re.search(r'varchar\((\d+)\)', col_type)
|
|
474
|
-
if m:
|
|
475
|
-
maxlen = int(m.group(1))
|
|
476
|
-
prefix_len = min(100, maxlen)
|
|
477
|
-
return f"`{self._normalize_col(col)}`({prefix_len})"
|
|
478
|
-
else:
|
|
479
|
-
return f"`{self._normalize_col(col)}`(100)"
|
|
480
|
-
elif 'text' in col_type:
|
|
481
|
-
return f"`{self._normalize_col(col)}`(100)"
|
|
482
|
-
else:
|
|
483
|
-
return f"`{self._normalize_col(col)}`"
|
|
484
|
-
|
|
485
|
-
# 处理主键
|
|
486
|
-
if primary_keys and len(primary_keys) > 0:
|
|
487
|
-
# 验证主键列是否存在于set_typ中
|
|
488
|
-
valid_primary_keys = []
|
|
489
|
-
for pk in primary_keys:
|
|
490
|
-
normalized_pk = self._normalize_col(pk)
|
|
491
|
-
if normalized_pk in set_typ:
|
|
492
|
-
valid_primary_keys.append(pk)
|
|
493
|
-
else:
|
|
494
|
-
logger.warning('主键列不存在于表结构中,跳过', {
|
|
495
|
-
'库': db_name,
|
|
496
|
-
'表': table_name,
|
|
497
|
-
'列': pk,
|
|
498
|
-
'规范化后': normalized_pk,
|
|
499
|
-
'可用列': list(set_typ.keys())
|
|
500
|
-
})
|
|
501
|
-
|
|
502
|
-
if valid_primary_keys:
|
|
503
|
-
# 如果指定了主键,直接使用指定的主键
|
|
504
|
-
safe_primary_keys = [_index_col_sql(pk) for pk in valid_primary_keys]
|
|
505
|
-
primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
|
|
229
|
+
# 字符串类型
|
|
230
|
+
elif 'varchar' in mysql_type_lower:
|
|
231
|
+
str_value = str(value)
|
|
232
|
+
# 检查长度限制
|
|
233
|
+
match = re.search(r'\((\d+)\)', mysql_type)
|
|
234
|
+
if match:
|
|
235
|
+
max_len = int(match.group(1))
|
|
236
|
+
if len(str_value.encode('utf-8')) > max_len:
|
|
237
|
+
return str_value.encode('utf-8')[:max_len].decode('utf-8', 'ignore')
|
|
238
|
+
return str_value
|
|
239
|
+
|
|
240
|
+
# 默认转为字符串
|
|
241
|
+
return str(value)
|
|
242
|
+
|
|
243
|
+
@staticmethod
|
|
244
|
+
def _get_default_value(mysql_type: str) -> Any:
|
|
245
|
+
"""获取MySQL类型的默认值"""
|
|
246
|
+
mysql_type_lower = mysql_type.lower()
|
|
247
|
+
|
|
248
|
+
if any(t in mysql_type_lower for t in ['int', 'bigint', 'tinyint', 'smallint']):
|
|
249
|
+
return 0
|
|
250
|
+
elif any(t in mysql_type_lower for t in ['decimal', 'float', 'double']):
|
|
251
|
+
return 0.0
|
|
252
|
+
elif any(t in mysql_type_lower for t in ['varchar', 'text', 'char']):
|
|
253
|
+
return 'none'
|
|
254
|
+
elif 'date' in mysql_type_lower:
|
|
255
|
+
if 'datetime' in mysql_type_lower:
|
|
256
|
+
return '2000-01-01 00:00:00'
|
|
506
257
|
else:
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
'表': table_name,
|
|
511
|
-
'原始主键': primary_keys
|
|
512
|
-
})
|
|
513
|
-
primary_key_sql = f"PRIMARY KEY (`id`)"
|
|
258
|
+
return '2000-01-01'
|
|
259
|
+
elif 'json' in mysql_type_lower:
|
|
260
|
+
return '{}'
|
|
514
261
|
else:
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
if date_column and date_column in set_typ:
|
|
521
|
-
safe_date_col = _index_col_sql(date_column)
|
|
522
|
-
index_defs.append(f"INDEX `idx_{self._normalize_col(date_column)}` ({safe_date_col})")
|
|
523
|
-
|
|
524
|
-
# 收集所有唯一约束中涉及的列,避免重复创建普通索引
|
|
525
|
-
unique_columns = set()
|
|
526
|
-
if unique_keys:
|
|
527
|
-
for unique_cols in unique_keys:
|
|
528
|
-
if unique_cols:
|
|
529
|
-
for col in unique_cols:
|
|
530
|
-
normalized_col = self._normalize_col(col)
|
|
531
|
-
if normalized_col in set_typ:
|
|
532
|
-
unique_columns.add(normalized_col)
|
|
533
|
-
|
|
534
|
-
if indexes:
|
|
535
|
-
for idx_col in indexes:
|
|
536
|
-
normalized_idx_col = self._normalize_col(idx_col)
|
|
537
|
-
if normalized_idx_col in set_typ:
|
|
538
|
-
# 检查是否与唯一约束冲突
|
|
539
|
-
if normalized_idx_col in unique_columns:
|
|
540
|
-
logger.warning('索引列已在唯一约束中定义,跳过普通索引', {
|
|
541
|
-
'库': db_name,
|
|
542
|
-
'表': table_name,
|
|
543
|
-
'列': idx_col,
|
|
544
|
-
'原因': '列已在唯一约束中定义'
|
|
545
|
-
})
|
|
546
|
-
continue
|
|
547
|
-
safe_idx_col = _index_col_sql(idx_col)
|
|
548
|
-
index_defs.append(f"INDEX `idx_{normalized_idx_col}` ({safe_idx_col})")
|
|
549
|
-
else:
|
|
550
|
-
logger.warning('索引列不存在于表结构中,跳过', {
|
|
551
|
-
'库': db_name,
|
|
552
|
-
'表': table_name,
|
|
553
|
-
'列': idx_col,
|
|
554
|
-
'规范化后': normalized_idx_col,
|
|
555
|
-
'可用列': list(set_typ.keys())
|
|
556
|
-
})
|
|
557
|
-
|
|
558
|
-
# UNIQUE KEY定义
|
|
559
|
-
unique_defs = []
|
|
560
|
-
if unique_keys:
|
|
561
|
-
for unique_cols in unique_keys:
|
|
562
|
-
if not unique_cols:
|
|
563
|
-
continue
|
|
564
|
-
# 检查唯一约束是否与主键冲突
|
|
565
|
-
if primary_keys:
|
|
566
|
-
# 如果唯一约束的列是主键的一部分,则跳过
|
|
567
|
-
if set(unique_cols).issubset(set(primary_keys)):
|
|
568
|
-
logger.warning('跳过与主键冲突的唯一约束', {
|
|
569
|
-
'库': db_name,
|
|
570
|
-
'表': table_name,
|
|
571
|
-
'唯一约束': unique_cols,
|
|
572
|
-
'主键': primary_keys
|
|
573
|
-
})
|
|
574
|
-
continue
|
|
575
|
-
|
|
576
|
-
# 验证唯一约束的列是否存在于set_typ中
|
|
577
|
-
valid_unique_cols = []
|
|
578
|
-
for col in unique_cols:
|
|
579
|
-
normalized_col = self._normalize_col(col)
|
|
580
|
-
if normalized_col in set_typ:
|
|
581
|
-
valid_unique_cols.append(col)
|
|
582
|
-
else:
|
|
583
|
-
logger.warning('唯一约束列不存在于表结构中,跳过', {
|
|
584
|
-
'库': db_name,
|
|
585
|
-
'表': table_name,
|
|
586
|
-
'列': col,
|
|
587
|
-
'规范化后': normalized_col,
|
|
588
|
-
'可用列': list(set_typ.keys())
|
|
589
|
-
})
|
|
590
|
-
|
|
591
|
-
if not valid_unique_cols:
|
|
592
|
-
logger.warning('唯一约束的所有列都不存在于表结构中,跳过整个约束', {
|
|
593
|
-
'库': db_name,
|
|
594
|
-
'表': table_name,
|
|
595
|
-
'原始约束': unique_cols
|
|
596
|
-
})
|
|
597
|
-
continue
|
|
598
|
-
|
|
599
|
-
safe_unique_cols = [_index_col_sql(col) for col in valid_unique_cols]
|
|
600
|
-
unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in valid_unique_cols])}"
|
|
601
|
-
unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
|
|
602
|
-
|
|
603
|
-
index_defs = list(set(index_defs))
|
|
604
|
-
all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
|
|
605
|
-
|
|
606
|
-
# 添加调试日志
|
|
607
|
-
logger.debug('建表SQL生成', {
|
|
608
|
-
'库': db_name,
|
|
609
|
-
'表': table_name,
|
|
610
|
-
'列定义': column_defs,
|
|
611
|
-
'主键': primary_key_sql,
|
|
612
|
-
'索引': index_defs,
|
|
613
|
-
'唯一约束': unique_defs,
|
|
614
|
-
'set_typ键': list(set_typ.keys())
|
|
615
|
-
})
|
|
616
|
-
|
|
617
|
-
sql = f"""
|
|
618
|
-
CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
|
|
619
|
-
{','.join(all_defs)}
|
|
620
|
-
) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
|
|
621
|
-
"""
|
|
622
|
-
conn = None
|
|
623
|
-
try:
|
|
624
|
-
with self._get_connection() as conn:
|
|
625
|
-
with conn.cursor() as cursor:
|
|
626
|
-
cursor.execute(sql)
|
|
627
|
-
conn.commit()
|
|
628
|
-
logger.debug('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
|
|
629
|
-
except Exception as e:
|
|
630
|
-
logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e), '异常类型': type(e).__name__})
|
|
631
|
-
if conn is not None:
|
|
632
|
-
conn.rollback()
|
|
633
|
-
raise
|
|
634
|
-
|
|
635
|
-
def _validate_datetime(self, value: str, date_type: bool = False, no_log: bool = False) -> Any:
|
|
636
|
-
"""
|
|
637
|
-
验证并标准化日期时间格式
|
|
638
|
-
|
|
639
|
-
:param value: 日期时间值
|
|
640
|
-
:param date_type: 是否返回日期类型(True)或字符串(False)
|
|
641
|
-
:param no_log: 记录日志,默认为False
|
|
642
|
-
:return: 标准化后的日期时间字符串或日期对象
|
|
643
|
-
:raises ValueError: 当日期格式无效时抛出
|
|
644
|
-
"""
|
|
645
|
-
# 处理 pandas Timestamp 对象
|
|
262
|
+
return 'none'
|
|
263
|
+
|
|
264
|
+
@staticmethod
|
|
265
|
+
def _convert_to_datetime(value: Any) -> str:
|
|
266
|
+
"""转换为datetime格式"""
|
|
646
267
|
if hasattr(value, 'strftime'):
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
268
|
+
return value.strftime('%Y-%m-%d %H:%M:%S')
|
|
269
|
+
|
|
270
|
+
value_str = str(value).strip()
|
|
271
|
+
|
|
272
|
+
# 处理特殊的无效值
|
|
273
|
+
if value_str.lower() in ['none', 'null', 'nan', '', 'nat']:
|
|
274
|
+
return '2000-01-01 00:00:00'
|
|
652
275
|
|
|
653
|
-
# 确保 value 是字符串
|
|
654
|
-
if not isinstance(value, str):
|
|
655
|
-
value = str(value)
|
|
656
|
-
|
|
657
276
|
formats = [
|
|
658
277
|
'%Y-%m-%d %H:%M:%S',
|
|
659
278
|
'%Y-%m-%d',
|
|
@@ -661,2251 +280,607 @@ class MySQLUploader:
|
|
|
661
280
|
'%Y/%m/%d',
|
|
662
281
|
'%Y%m%d',
|
|
663
282
|
'%Y-%m-%dT%H:%M:%S',
|
|
664
|
-
'%Y-%m-%d %H:%M:%S.%f',
|
|
665
|
-
'%Y/%-m/%-d', # 2023/1/8
|
|
666
|
-
'%Y-%-m-%-d', # 2023-01-8
|
|
667
283
|
]
|
|
284
|
+
|
|
668
285
|
for fmt in formats:
|
|
669
286
|
try:
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
return result
|
|
673
|
-
else:
|
|
674
|
-
result = datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
|
|
675
|
-
return result
|
|
287
|
+
dt = datetime.datetime.strptime(value_str, fmt)
|
|
288
|
+
return dt.strftime('%Y-%m-%d %H:%M:%S')
|
|
676
289
|
except ValueError:
|
|
677
290
|
continue
|
|
678
|
-
if not no_log:
|
|
679
|
-
logger.error('无效的日期格式', {'值': value})
|
|
680
|
-
raise ValueError(f"无效的日期格式: `{value}`")
|
|
681
|
-
|
|
682
|
-
def _get_fallback_value(self, column_type_lower: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None, original_value: Any = None) -> Any:
|
|
683
|
-
"""
|
|
684
|
-
获取空值的兜底填充值
|
|
685
|
-
"""
|
|
686
|
-
# 兜底填充值映射
|
|
687
|
-
fallback_map = {
|
|
688
|
-
'int': 0,
|
|
689
|
-
'bigint': 0,
|
|
690
|
-
'tinyint': 0,
|
|
691
|
-
'smallint': 0,
|
|
692
|
-
'mediumint': 0,
|
|
693
|
-
'decimal': 0.0,
|
|
694
|
-
'float': 0.0,
|
|
695
|
-
'double': 0.0,
|
|
696
|
-
'date': '2000-01-01',
|
|
697
|
-
'datetime': '2000-01-01 00:00:00',
|
|
698
|
-
'timestamp': '2000-01-01 00:00:00',
|
|
699
|
-
'json': '{}',
|
|
700
|
-
'varchar': 'none',
|
|
701
|
-
'text': 'none',
|
|
702
|
-
'char': 'none',
|
|
703
|
-
'mediumtext': 'none',
|
|
704
|
-
'longtext': 'none',
|
|
705
|
-
'enum': None, # enum类型需要特殊处理,使用第一个可选值
|
|
706
|
-
'set': '', # set类型默认为空字符串
|
|
707
|
-
}
|
|
708
291
|
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
# "列": col_name,
|
|
727
|
-
# "值": original_value,
|
|
728
|
-
# "兜底值": fallback
|
|
729
|
-
# })
|
|
730
|
-
return fallback # 直接返回兜底值
|
|
731
|
-
|
|
732
|
-
return None # 允许空值时返回None
|
|
733
|
-
|
|
734
|
-
def _convert_to_int(self, value):
|
|
735
|
-
"""
|
|
736
|
-
尝试将value转换为int
|
|
737
|
-
"""
|
|
738
|
-
# 处理numpy/pandas标量
|
|
739
|
-
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
|
292
|
+
# 如果所有格式都无法解析,返回默认值而不是抛出异常
|
|
293
|
+
return '2000-01-01 00:00:00'
|
|
294
|
+
|
|
295
|
+
@staticmethod
|
|
296
|
+
def _convert_to_date(value: Any) -> str:
|
|
297
|
+
"""转换为date格式"""
|
|
298
|
+
if hasattr(value, 'strftime'):
|
|
299
|
+
return value.strftime('%Y-%m-%d')
|
|
300
|
+
|
|
301
|
+
# 先转为datetime再提取日期部分
|
|
302
|
+
datetime_str = DataValidator._convert_to_datetime(value)
|
|
303
|
+
return datetime_str.split(' ')[0]
|
|
304
|
+
|
|
305
|
+
@staticmethod
|
|
306
|
+
def _convert_to_int(value: Any) -> int:
|
|
307
|
+
"""转换为整数"""
|
|
308
|
+
if hasattr(value, 'item'):
|
|
740
309
|
try:
|
|
741
310
|
value = value.item()
|
|
742
311
|
except Exception:
|
|
743
312
|
pass
|
|
744
|
-
|
|
745
|
-
try:
|
|
746
|
-
extracted_value = value.value
|
|
747
|
-
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').isdigit():
|
|
748
|
-
value = extracted_value
|
|
749
|
-
except Exception:
|
|
750
|
-
pass
|
|
313
|
+
|
|
751
314
|
try:
|
|
752
315
|
return int(value)
|
|
753
316
|
except (ValueError, TypeError):
|
|
754
317
|
try:
|
|
755
318
|
return int(float(value))
|
|
756
319
|
except (ValueError, TypeError):
|
|
757
|
-
raise
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
|
320
|
+
raise ValueError(f"无法转换为整数: {value}")
|
|
321
|
+
|
|
322
|
+
@staticmethod
|
|
323
|
+
def _convert_to_decimal(value: Any) -> Decimal:
|
|
324
|
+
"""转换为Decimal"""
|
|
325
|
+
if hasattr(value, 'item'):
|
|
764
326
|
try:
|
|
765
327
|
value = value.item()
|
|
766
328
|
except Exception:
|
|
767
329
|
pass
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
330
|
+
|
|
331
|
+
# 处理百分比字符串
|
|
332
|
+
if isinstance(value, str) and '%' in value:
|
|
333
|
+
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
|
334
|
+
value = float(value.strip().replace('%', '')) / 100
|
|
335
|
+
|
|
336
|
+
try:
|
|
337
|
+
return Decimal(str(value))
|
|
338
|
+
except (ValueError, TypeError, InvalidOperation):
|
|
339
|
+
raise ValueError(f"无法转换为数值: {value}")
|
|
776
340
|
|
|
777
|
-
def _convert_to_decimal(self, value):
|
|
778
|
-
"""
|
|
779
|
-
尝试将value转换为Decimal,兼容常见数值类型。
|
|
780
|
-
"""
|
|
781
|
-
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
|
782
|
-
try:
|
|
783
|
-
value = value.item()
|
|
784
|
-
except Exception:
|
|
785
|
-
pass
|
|
786
|
-
elif hasattr(value, 'value') and not isinstance(value, str):
|
|
787
|
-
try:
|
|
788
|
-
extracted_value = value.value
|
|
789
|
-
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
|
|
790
|
-
value = extracted_value
|
|
791
|
-
except Exception:
|
|
792
|
-
pass
|
|
793
|
-
return Decimal(str(value))
|
|
794
341
|
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
column_type_lower = column_type.lower() if column_type else ''
|
|
806
|
-
|
|
807
|
-
# 对于包含CURRENT_TIMESTAMP的TIMESTAMP字段,跳过验证,让MySQL自动处理
|
|
808
|
-
if ('timestamp' in column_type_lower and 'current_timestamp' in column_type_lower and
|
|
809
|
-
col_name in ['创建时间', '更新时间']):
|
|
810
|
-
# 这些字段由MySQL自动处理,不需要传入值
|
|
811
|
-
return None
|
|
812
|
-
|
|
813
|
-
# 统一的空值检查(None、空字符串、NaN)
|
|
814
|
-
is_empty_value = False
|
|
815
|
-
if value is None:
|
|
816
|
-
is_empty_value = True
|
|
817
|
-
elif value == '':
|
|
818
|
-
# 空字符串对于字符串类型是有效值
|
|
819
|
-
if any(t in column_type_lower for t in ['varchar', 'text', 'char', 'mediumtext', 'longtext']):
|
|
820
|
-
return ""
|
|
821
|
-
is_empty_value = True
|
|
822
|
-
else:
|
|
823
|
-
# 检查NaN值(避免对列表和字典使用pd.isna)
|
|
824
|
-
if not isinstance(value, (list, dict)):
|
|
825
|
-
try:
|
|
826
|
-
is_empty_value = pd.isna(value) or (isinstance(value, (float, Decimal)) and math.isinf(value))
|
|
827
|
-
except (ValueError, TypeError):
|
|
828
|
-
is_empty_value = False
|
|
829
|
-
|
|
830
|
-
# 统一处理空值
|
|
831
|
-
if is_empty_value:
|
|
832
|
-
fallback_value = self._get_fallback_value(column_type_lower, allow_null, db_name, table_name, col_name, value)
|
|
833
|
-
# 如果返回了兜底值(非None),直接返回,不再进行后续验证
|
|
834
|
-
# 因为兜底值已经是根据列类型设计的合适值
|
|
835
|
-
if fallback_value is not None:
|
|
836
|
-
return fallback_value
|
|
837
|
-
# 如果返回None(允许空值的情况),继续后续处理
|
|
838
|
-
return None
|
|
839
|
-
|
|
840
|
-
# JSON类型验证和转换
|
|
841
|
-
if 'json' in column_type_lower:
|
|
842
|
-
if isinstance(value, (dict, list)):
|
|
843
|
-
try:
|
|
844
|
-
return json.dumps(value, ensure_ascii=False)
|
|
845
|
-
except (TypeError, ValueError) as e:
|
|
846
|
-
logger.error(f"JSON序列化失败: {e}", {"库": db_name, "表": table_name, "列": col_name, "值": value})
|
|
847
|
-
raise ValueError(f"JSON序列化失败: {e}")
|
|
848
|
-
elif isinstance(value, str):
|
|
849
|
-
# 验证字符串是否为有效的JSON
|
|
850
|
-
try:
|
|
851
|
-
json.loads(value)
|
|
852
|
-
return value
|
|
853
|
-
except (TypeError, ValueError) as e:
|
|
854
|
-
logger.error(f"无效的JSON字符串: {e}", {"库": db_name, "表": table_name, "列": col_name, "值": value})
|
|
855
|
-
raise ValueError(f"无效的JSON字符串: {e}")
|
|
856
|
-
else:
|
|
857
|
-
# 其他类型转换为字符串
|
|
858
|
-
return str(value)
|
|
859
|
-
|
|
860
|
-
original_value = value
|
|
861
|
-
|
|
862
|
-
# 日期时间类型验证
|
|
863
|
-
if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
|
|
864
|
-
return self._validate_datetime(value, date_type=False, no_log=True)
|
|
865
|
-
elif 'date' in column_type_lower:
|
|
866
|
-
return self._validate_datetime(value, date_type=True, no_log=True)
|
|
867
|
-
# 数值类型验证
|
|
868
|
-
elif 'int' in column_type_lower:
|
|
869
|
-
try:
|
|
870
|
-
return self._convert_to_int(value)
|
|
871
|
-
except (ValueError, TypeError):
|
|
872
|
-
logger.error(f"值 `{value}` 无法转换为整数", {"库": db_name, "表": table_name, "列": col_name})
|
|
873
|
-
raise ValueError(f"值 `{value}` 无法转换为整数")
|
|
874
|
-
elif any(t in column_type_lower for t in ['decimal', 'float', 'double']):
|
|
875
|
-
# 百分比字符串处理
|
|
876
|
-
if isinstance(value, str) and '%' in value:
|
|
877
|
-
try:
|
|
878
|
-
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
|
879
|
-
value = float(value.strip().replace('%', '')) / 100
|
|
880
|
-
else:
|
|
881
|
-
logger.warning("百分比字符串不符合格式,跳过转换", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
|
882
|
-
value = original_value
|
|
883
|
-
except (ValueError, TypeError):
|
|
884
|
-
logger.warning("百分比字符串转换失败,保留原始值", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
|
885
|
-
value = original_value
|
|
886
|
-
try:
|
|
887
|
-
if 'decimal' in column_type_lower:
|
|
888
|
-
precision, scale = self._get_decimal_scale(column_type)
|
|
889
|
-
value_decimal = self._convert_to_decimal(value)
|
|
890
|
-
if len(value_decimal.as_tuple().digits) - abs(value_decimal.as_tuple().exponent) > precision - scale:
|
|
891
|
-
raise ValueError(f"整数部分超出范围")
|
|
892
|
-
return value_decimal
|
|
893
|
-
else: # float/double
|
|
894
|
-
return self._convert_to_float(value)
|
|
895
|
-
except (ValueError, TypeError, InvalidOperation) as e:
|
|
896
|
-
logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
|
|
897
|
-
raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
|
|
898
|
-
# ENUM类型验证
|
|
899
|
-
elif 'enum' in column_type_lower:
|
|
900
|
-
# 提取enum的可选值,支持单引号和双引号
|
|
901
|
-
enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
|
|
902
|
-
str_value = str(value).strip()
|
|
903
|
-
if str_value not in enum_values:
|
|
904
|
-
logger.error(f"值 `{str_value}` 不在enum允许的值中: {enum_values}",
|
|
905
|
-
{"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
|
|
906
|
-
raise ValueError(f"值 `{str_value}` 不在enum允许的值中: {enum_values}")
|
|
907
|
-
return str_value
|
|
908
|
-
# SET类型验证
|
|
909
|
-
elif 'set' in column_type_lower:
|
|
910
|
-
# 提取set的可选值,支持单引号和双引号
|
|
911
|
-
set_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
|
|
912
|
-
str_value = str(value).strip()
|
|
913
|
-
# SET类型可以是多个值的组合,用逗号分隔
|
|
914
|
-
if ',' in str_value:
|
|
915
|
-
input_values = [v.strip() for v in str_value.split(',')]
|
|
916
|
-
else:
|
|
917
|
-
input_values = [str_value]
|
|
918
|
-
|
|
919
|
-
for val in input_values:
|
|
920
|
-
if val and val not in set_values:
|
|
921
|
-
logger.error(f"值 `{val}` 不在set允许的值中: {set_values}",
|
|
922
|
-
{"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
|
|
923
|
-
raise ValueError(f"值 `{val}` 不在set允许的值中: {set_values}")
|
|
924
|
-
return str_value
|
|
925
|
-
# 字符串类型验证
|
|
926
|
-
elif 'varchar' in column_type_lower:
|
|
927
|
-
str_value = str(value)
|
|
928
|
-
try:
|
|
929
|
-
max_len = int(re.search(r'\((\d+)\)', column_type).group(1))
|
|
930
|
-
if len(str_value.encode('utf-8')) > max_len:
|
|
931
|
-
logger.warning(f"列`{col_name}`的值`{str_value}`长度({len(str_value.encode('utf-8'))})超出varchar({max_len})限制,将进行截断", {"库": db_name, "表": table_name})
|
|
932
|
-
return self._truncate_str(str_value, max_len)
|
|
933
|
-
except (AttributeError, IndexError):
|
|
934
|
-
pass
|
|
935
|
-
return str_value
|
|
342
|
+
class TableManager:
|
|
343
|
+
"""表管理器"""
|
|
344
|
+
|
|
345
|
+
def __init__(self, connection_manager: DatabaseConnectionManager, collation: str):
|
|
346
|
+
self.conn_mgr = connection_manager
|
|
347
|
+
self.collation = collation
|
|
348
|
+
|
|
349
|
+
def ensure_database_exists(self, db_name: str):
|
|
350
|
+
"""确保数据库存在"""
|
|
351
|
+
db_name = self._sanitize_identifier(db_name)
|
|
936
352
|
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
353
|
+
with self.conn_mgr.get_connection() as conn:
|
|
354
|
+
with conn.cursor() as cursor:
|
|
355
|
+
cursor.execute(
|
|
356
|
+
"SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s",
|
|
357
|
+
(db_name,)
|
|
358
|
+
)
|
|
359
|
+
if not cursor.fetchone():
|
|
360
|
+
charset = self.conn_mgr.config['charset']
|
|
361
|
+
sql = f"CREATE DATABASE `{db_name}` CHARACTER SET {charset} COLLATE {self.collation}"
|
|
362
|
+
cursor.execute(sql)
|
|
363
|
+
conn.commit()
|
|
364
|
+
logger.debug('数据库已创建', {'database': db_name})
|
|
365
|
+
|
|
366
|
+
def table_exists(self, db_name: str, table_name: str) -> bool:
|
|
367
|
+
"""检查表是否存在"""
|
|
368
|
+
db_name = self._sanitize_identifier(db_name)
|
|
369
|
+
table_name = self._sanitize_identifier(table_name)
|
|
370
|
+
|
|
371
|
+
with self.conn_mgr.get_connection() as conn:
|
|
372
|
+
with conn.cursor() as cursor:
|
|
373
|
+
cursor.execute(
|
|
374
|
+
"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s",
|
|
375
|
+
(db_name, table_name)
|
|
376
|
+
)
|
|
377
|
+
return bool(cursor.fetchone())
|
|
378
|
+
|
|
379
|
+
def create_table(self, db_name: str, table_name: str, columns: Dict[str, str],
|
|
380
|
+
primary_keys: Optional[List[str]] = None,
|
|
381
|
+
unique_keys: Optional[List[List[str]]] = None):
|
|
382
|
+
"""创建表"""
|
|
383
|
+
db_name = self._sanitize_identifier(db_name)
|
|
384
|
+
table_name = self._sanitize_identifier(table_name)
|
|
385
|
+
|
|
386
|
+
# 构建列定义
|
|
387
|
+
column_defs = []
|
|
388
|
+
|
|
389
|
+
# 始终添加自增ID列作为主键
|
|
390
|
+
column_defs.append("`id` BIGINT NOT NULL AUTO_INCREMENT")
|
|
391
|
+
|
|
392
|
+
# 添加业务列
|
|
393
|
+
for col_name, col_type in columns.items():
|
|
394
|
+
if col_name.lower() in ['id', 'create_at', 'update_at']:
|
|
395
|
+
continue
|
|
396
|
+
safe_col_name = self._sanitize_identifier(col_name)
|
|
397
|
+
column_defs.append(f"`{safe_col_name}` {col_type} NOT NULL")
|
|
398
|
+
|
|
399
|
+
# 添加时间戳列
|
|
400
|
+
column_defs.append("`create_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP")
|
|
401
|
+
column_defs.append("`update_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
|
|
402
|
+
|
|
403
|
+
# 主键定义(始终使用id作为主键)
|
|
404
|
+
primary_key_def = "PRIMARY KEY (`id`)"
|
|
405
|
+
|
|
406
|
+
# 唯一约束定义
|
|
407
|
+
unique_defs = []
|
|
408
|
+
if unique_keys:
|
|
409
|
+
for i, uk in enumerate(unique_keys):
|
|
410
|
+
# 过滤掉系统列
|
|
411
|
+
filtered_uk = [col for col in uk if col.lower() not in ['id', 'create_at', 'update_at']]
|
|
412
|
+
if filtered_uk:
|
|
413
|
+
safe_uk = [f"`{self._sanitize_identifier(col)}`" for col in filtered_uk]
|
|
414
|
+
unique_name = f"uniq_{i}"
|
|
415
|
+
unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_uk)})")
|
|
416
|
+
|
|
417
|
+
# 组合所有定义
|
|
418
|
+
all_defs = column_defs + [primary_key_def] + unique_defs
|
|
419
|
+
|
|
420
|
+
charset = self.conn_mgr.config['charset']
|
|
421
|
+
sql = f"""
|
|
422
|
+
CREATE TABLE `{db_name}`.`{table_name}` (
|
|
423
|
+
{','.join(all_defs)}
|
|
424
|
+
) ENGINE=InnoDB DEFAULT CHARSET={charset} COLLATE={self.collation}
|
|
981
425
|
"""
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
sql_check = '''
|
|
987
|
-
SELECT COUNT(1) FROM INFORMATION_SCHEMA.STATISTICS
|
|
988
|
-
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s
|
|
989
|
-
'''
|
|
990
|
-
sql_create = f'ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{column}` (`{column}`)'
|
|
991
|
-
try:
|
|
992
|
-
with self._get_connection() as conn:
|
|
993
|
-
with conn.cursor() as cursor:
|
|
994
|
-
cursor.execute(sql_check, (db_name, table_name, column))
|
|
995
|
-
exists = cursor.fetchone()
|
|
996
|
-
if exists and list(exists.values())[0] > 0:
|
|
997
|
-
logger.debug('索引检查', {'库': db_name, '表': table_name, '索引列': column})
|
|
998
|
-
return
|
|
999
|
-
cursor.execute(sql_create)
|
|
426
|
+
|
|
427
|
+
with self.conn_mgr.get_connection() as conn:
|
|
428
|
+
with conn.cursor() as cursor:
|
|
429
|
+
cursor.execute(sql)
|
|
1000
430
|
conn.commit()
|
|
1001
|
-
logger.debug('
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
def _get_existing_unique_keys(self, db_name: str, table_name: str) -> List[List[str]]:
|
|
1007
|
-
"""
|
|
1008
|
-
获取表中所有UNIQUE KEY的列组合(不含主键)。
|
|
1009
|
-
返回:[[col1, col2], ...]
|
|
1010
|
-
"""
|
|
1011
|
-
db_name = self._validate_identifier(db_name, is_database=True)
|
|
1012
|
-
table_name = self._validate_identifier(table_name)
|
|
1013
|
-
sql = '''
|
|
1014
|
-
SELECT INDEX_NAME, COLUMN_NAME
|
|
1015
|
-
FROM INFORMATION_SCHEMA.STATISTICS
|
|
1016
|
-
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND NON_UNIQUE = 0 AND INDEX_NAME != 'PRIMARY'
|
|
1017
|
-
ORDER BY INDEX_NAME, SEQ_IN_INDEX
|
|
1018
|
-
'''
|
|
1019
|
-
unique_map = {}
|
|
1020
|
-
try:
|
|
1021
|
-
with self._get_connection() as conn:
|
|
1022
|
-
with conn.cursor() as cursor:
|
|
1023
|
-
cursor.execute(sql, (db_name, table_name))
|
|
1024
|
-
for row in cursor.fetchall():
|
|
1025
|
-
idx = row['INDEX_NAME']
|
|
1026
|
-
col = row['COLUMN_NAME']
|
|
1027
|
-
unique_map.setdefault(idx, []).append(col)
|
|
1028
|
-
except Exception as e:
|
|
1029
|
-
logger.warning('获取UNIQUE KEY信息失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
1030
|
-
# 只返回列名组合,全部清洗小写
|
|
1031
|
-
return [[self._normalize_col(c) for c in cols] for cols in unique_map.values() if cols]
|
|
1032
|
-
|
|
1033
|
-
def _add_unique_key(self, db_name: str, table_name: str, unique_cols: List[str]):
|
|
1034
|
-
"""
|
|
1035
|
-
添加UNIQUE KEY
|
|
1036
|
-
"""
|
|
1037
|
-
safe_cols = [self._normalize_col(col) for col in unique_cols]
|
|
1038
|
-
unique_name = f"uniq_{'_'.join(safe_cols)}"
|
|
1039
|
-
sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
|
|
431
|
+
logger.debug('表已创建', {'database': db_name, 'table': table_name})
|
|
432
|
+
|
|
433
|
+
def get_partition_table_name(self, base_name: str, date_value: str, partition_by: str) -> str:
|
|
434
|
+
"""获取分表名称"""
|
|
1040
435
|
try:
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
cursor.execute(sql)
|
|
1044
|
-
conn.commit()
|
|
1045
|
-
logger.debug('添加唯一约束列成功', {'库': db_name, '表': table_name, '列': unique_cols})
|
|
1046
|
-
except Exception as e:
|
|
1047
|
-
logger.warning('唯一约束列添加失败', {'库': db_name, '表': table_name, '列': unique_cols, '错误': str(e)})
|
|
1048
|
-
|
|
1049
|
-
def _upload_to_table(
|
|
1050
|
-
self,
|
|
1051
|
-
db_name: str,
|
|
1052
|
-
table_name: str,
|
|
1053
|
-
data: List[Dict],
|
|
1054
|
-
set_typ: Dict[str, str],
|
|
1055
|
-
primary_keys: Optional[List[str]],
|
|
1056
|
-
check_duplicate: bool,
|
|
1057
|
-
duplicate_columns: Optional[List[str]],
|
|
1058
|
-
allow_null: bool,
|
|
1059
|
-
auto_create: bool,
|
|
1060
|
-
date_column: Optional[str],
|
|
1061
|
-
indexes: Optional[List[str]],
|
|
1062
|
-
batch_id: Optional[str] = None,
|
|
1063
|
-
update_on_duplicate: bool = False,
|
|
1064
|
-
transaction_mode: str = "batch",
|
|
1065
|
-
unique_keys: Optional[List[List[str]]] = None
|
|
1066
|
-
):
|
|
1067
|
-
"""实际执行表上传的方法"""
|
|
1068
|
-
table_existed = self._check_table_exists(db_name, table_name)
|
|
1069
|
-
if not table_existed:
|
|
1070
|
-
if auto_create:
|
|
1071
|
-
self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
|
|
1072
|
-
allow_null=allow_null, unique_keys=unique_keys)
|
|
1073
|
-
else:
|
|
1074
|
-
logger.error('数据表不存在', {
|
|
1075
|
-
'库': db_name,
|
|
1076
|
-
'表': table_name,
|
|
1077
|
-
})
|
|
1078
|
-
raise ValueError(f"数据表不存在: `{db_name}`.`{table_name}`")
|
|
1079
|
-
if table_existed and unique_keys:
|
|
1080
|
-
try:
|
|
1081
|
-
exist_ukeys = self._get_existing_unique_keys(db_name, table_name)
|
|
1082
|
-
exist_ukeys_norm = [sorted([c.lower() for c in uk]) for uk in exist_ukeys]
|
|
1083
|
-
filtered_ukeys = [uk for uk in unique_keys if 1 <= len(uk) <= 20]
|
|
1084
|
-
to_add = []
|
|
1085
|
-
for uk in filtered_ukeys:
|
|
1086
|
-
norm_uk = sorted([c.lower() for c in uk])
|
|
1087
|
-
if norm_uk not in exist_ukeys_norm:
|
|
1088
|
-
to_add.append(uk)
|
|
1089
|
-
max_unique_keys = 10
|
|
1090
|
-
if len(exist_ukeys) + len(to_add) > max_unique_keys:
|
|
1091
|
-
logger.warning('unique_keys超限', {
|
|
1092
|
-
'库': db_name,
|
|
1093
|
-
'表': table_name,
|
|
1094
|
-
'已存在': exist_ukeys,
|
|
1095
|
-
'本次待添加': to_add,
|
|
1096
|
-
'最大数量': max_unique_keys
|
|
1097
|
-
})
|
|
1098
|
-
to_add = to_add[:max_unique_keys - len(exist_ukeys)]
|
|
1099
|
-
for uk in to_add:
|
|
1100
|
-
self._add_unique_key(db_name, table_name, uk)
|
|
1101
|
-
except Exception as e:
|
|
1102
|
-
logger.warning('动态unique key处理异常', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
1103
|
-
table_columns = self._get_table_columns(db_name, table_name)
|
|
1104
|
-
if not table_columns:
|
|
1105
|
-
logger.error('获取列失败', {
|
|
1106
|
-
'库': db_name,
|
|
1107
|
-
'表': table_name,
|
|
1108
|
-
'列': self._shorten_for_log(table_columns),
|
|
1109
|
-
})
|
|
1110
|
-
raise ValueError(f"获取列失败 `{db_name}`.`{table_name}`")
|
|
1111
|
-
# 检查并自动添加缺失的列
|
|
1112
|
-
missing_columns = [col for col in set_typ if col not in table_columns]
|
|
1113
|
-
if missing_columns:
|
|
1114
|
-
if not self.auto_creat_missing_cols:
|
|
1115
|
-
logger.error('列不存在且不支持自动添加,请手动维护表结构,并补齐缺失列', {
|
|
1116
|
-
'库': db_name,
|
|
1117
|
-
'表': table_name,
|
|
1118
|
-
'缺失列数': len(missing_columns),
|
|
1119
|
-
'缺失列': missing_columns,
|
|
1120
|
-
})
|
|
1121
|
-
raise ValueError(f"列不存在: `{missing_columns}` -> `{db_name}`.`{table_name}`")
|
|
436
|
+
if isinstance(date_value, str):
|
|
437
|
+
date_obj = pd.to_datetime(date_value)
|
|
1122
438
|
else:
|
|
1123
|
-
|
|
1124
|
-
# 自动添加缺失的列
|
|
1125
|
-
for col in missing_columns:
|
|
1126
|
-
try:
|
|
1127
|
-
self._add_column_to_table(db_name, table_name, col, set_typ[col], allow_null)
|
|
1128
|
-
logger.info('自动添加缺失列', {
|
|
1129
|
-
'库': db_name,
|
|
1130
|
-
'表': table_name,
|
|
1131
|
-
'列': col,
|
|
1132
|
-
'类型': set_typ[col]
|
|
1133
|
-
})
|
|
1134
|
-
except Exception as e:
|
|
1135
|
-
logger.error('添加列失败', {
|
|
1136
|
-
'库': db_name,
|
|
1137
|
-
'表': table_name,
|
|
1138
|
-
'列': col,
|
|
1139
|
-
'类型': set_typ[col],
|
|
1140
|
-
'错误': str(e)
|
|
1141
|
-
})
|
|
1142
|
-
raise ValueError(f"添加列失败: `{col}` -> `{db_name}`.`{table_name}`: {str(e)}")
|
|
1143
|
-
|
|
1144
|
-
# 重新获取表列信息
|
|
1145
|
-
table_columns = self._get_table_columns(db_name, table_name)
|
|
1146
|
-
if date_column and date_column in table_columns:
|
|
1147
|
-
try:
|
|
1148
|
-
self._ensure_index(db_name, table_name, date_column)
|
|
1149
|
-
except Exception as e:
|
|
1150
|
-
logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
|
|
1151
|
-
inserted, skipped, failed = self._insert_data(
|
|
1152
|
-
db_name, table_name, data, set_typ,
|
|
1153
|
-
check_duplicate, duplicate_columns,
|
|
1154
|
-
batch_id=batch_id,
|
|
1155
|
-
update_on_duplicate=update_on_duplicate,
|
|
1156
|
-
transaction_mode=transaction_mode
|
|
1157
|
-
)
|
|
1158
|
-
return inserted, skipped, failed
|
|
1159
|
-
|
|
1160
|
-
def _infer_data_type(self, value: Any, no_log: bool = False) -> str:
|
|
1161
|
-
"""
|
|
1162
|
-
根据值推断合适的MySQL数据类型
|
|
1163
|
-
|
|
1164
|
-
:param value: 要推断的值
|
|
1165
|
-
:param no_log: 记录日志,默认为False
|
|
1166
|
-
:return: MySQL数据类型字符串
|
|
1167
|
-
"""
|
|
1168
|
-
if value is None or str(value).lower() in ['', 'none', 'nan']:
|
|
1169
|
-
return 'VARCHAR(255)' # 默认字符串类型
|
|
1170
|
-
|
|
1171
|
-
# 检查是否是百分比字符串
|
|
1172
|
-
if isinstance(value, str):
|
|
1173
|
-
if '%' in value:
|
|
1174
|
-
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
|
1175
|
-
return 'DECIMAL(10, 4)' # 百分比转为小数,使用DECIMAL
|
|
1176
|
-
else:
|
|
1177
|
-
return 'VARCHAR(255)' # 不符合格式的百分比,视为字符串
|
|
1178
|
-
|
|
1179
|
-
if isinstance(value, bool):
|
|
1180
|
-
return 'TINYINT(1)'
|
|
1181
|
-
elif isinstance(value, int):
|
|
1182
|
-
# if -128 <= value <= 127:
|
|
1183
|
-
# return 'TINYINT'
|
|
1184
|
-
# elif -32768 <= value <= 32767:
|
|
1185
|
-
# return 'SMALLINT'
|
|
1186
|
-
# elif -8388608 <= value <= 8388607:
|
|
1187
|
-
# return 'MEDIUMINT'
|
|
1188
|
-
if -2147483648 <= value <= 2147483647:
|
|
1189
|
-
return 'INT'
|
|
1190
|
-
else:
|
|
1191
|
-
return 'BIGINT'
|
|
1192
|
-
elif isinstance(value, float):
|
|
1193
|
-
# 计算小数位数
|
|
1194
|
-
num_str = str(value)
|
|
1195
|
-
_, decimal_places = count_decimal_places(num_str)
|
|
1196
|
-
return f'DECIMAL(20,{min(decimal_places, 6)})' # 限制最大6位小数
|
|
1197
|
-
elif isinstance(value, (datetime.datetime, pd.Timestamp)):
|
|
1198
|
-
return 'DATETIME'
|
|
1199
|
-
elif isinstance(value, datetime.date):
|
|
1200
|
-
return 'DATE'
|
|
1201
|
-
elif isinstance(value, (list, dict)):
|
|
1202
|
-
return 'JSON'
|
|
1203
|
-
elif isinstance(value, str):
|
|
1204
|
-
# 尝试判断是否是日期时间
|
|
1205
|
-
try:
|
|
1206
|
-
self._validate_datetime(value=value, date_type=False, no_log=no_log)
|
|
1207
|
-
return 'DATETIME'
|
|
1208
|
-
except ValueError:
|
|
1209
|
-
pass
|
|
1210
|
-
|
|
1211
|
-
# 根据字符串长度选择合适类型
|
|
1212
|
-
length = len(value)
|
|
1213
|
-
if length <= 255:
|
|
1214
|
-
return 'VARCHAR(255)'
|
|
1215
|
-
elif length <= 65535:
|
|
1216
|
-
return 'TEXT'
|
|
1217
|
-
elif length <= 16777215:
|
|
1218
|
-
return 'MEDIUMTEXT'
|
|
1219
|
-
else:
|
|
1220
|
-
return 'LONGTEXT'
|
|
439
|
+
date_obj = date_value
|
|
1221
440
|
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
"""
|
|
1227
|
-
1. pandas:规范化列名
|
|
1228
|
-
2. 字典列表:规范化每个字典的键
|
|
1229
|
-
"""
|
|
1230
|
-
if isinstance(data, pd.DataFrame):
|
|
1231
|
-
if self.case_sensitive:
|
|
1232
|
-
data.columns = [self._validate_identifier(col) for col in data.columns]
|
|
1233
|
-
else:
|
|
1234
|
-
data.columns = [self._validate_identifier(col).lower() for col in data.columns]
|
|
1235
|
-
return data
|
|
1236
|
-
elif isinstance(data, list):
|
|
1237
|
-
if self.case_sensitive:
|
|
1238
|
-
return [{self._validate_identifier(k): v for k, v in item.items()} for item in data]
|
|
441
|
+
if partition_by == 'year':
|
|
442
|
+
return f"{base_name}_{date_obj.year}"
|
|
443
|
+
elif partition_by == 'month':
|
|
444
|
+
return f"{base_name}_{date_obj.year}_{date_obj.month:02d}"
|
|
1239
445
|
else:
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
# 处理自动时间戳功能
|
|
1257
|
-
if auto_timestamps:
|
|
1258
|
-
data, set_typ = self._process_auto_timestamps(data, set_typ, db_name, table_name)
|
|
446
|
+
raise ValueError("partition_by必须是'year'或'month'")
|
|
447
|
+
except Exception as e:
|
|
448
|
+
raise ValueError(f"无效的日期值: {date_value}, 错误: {str(e)}")
|
|
449
|
+
|
|
450
|
+
@staticmethod
|
|
451
|
+
def _sanitize_identifier(identifier: str) -> str:
|
|
452
|
+
"""清理标识符"""
|
|
453
|
+
if not identifier or not isinstance(identifier, str):
|
|
454
|
+
raise ValueError(f"无效的标识符: {identifier}")
|
|
455
|
+
|
|
456
|
+
# 清理特殊字符
|
|
457
|
+
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
|
458
|
+
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
|
459
|
+
|
|
460
|
+
if not cleaned:
|
|
461
|
+
raise ValueError(f"标识符清理后为空: {identifier}")
|
|
1259
462
|
|
|
1260
|
-
#
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
463
|
+
# 检查MySQL关键字
|
|
464
|
+
mysql_keywords = {
|
|
465
|
+
'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
|
|
466
|
+
'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
|
|
467
|
+
}
|
|
1264
468
|
|
|
1265
|
-
|
|
469
|
+
if len(cleaned) > 64:
|
|
470
|
+
cleaned = cleaned[:64]
|
|
471
|
+
|
|
472
|
+
if cleaned.lower() in mysql_keywords:
|
|
473
|
+
return f"`{cleaned}`"
|
|
474
|
+
|
|
475
|
+
return cleaned
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
class DataProcessor:
|
|
479
|
+
"""数据处理器"""
|
|
480
|
+
|
|
481
|
+
@staticmethod
|
|
482
|
+
def normalize_data(data: Union[Dict, List[Dict], pd.DataFrame]) -> List[Dict]:
|
|
483
|
+
"""标准化数据格式为字典列表"""
|
|
1266
484
|
if isinstance(data, pd.DataFrame):
|
|
1267
|
-
|
|
1268
|
-
if self.case_sensitive:
|
|
1269
|
-
data.columns = [self._validate_identifier(col) for col in data.columns]
|
|
1270
|
-
else:
|
|
1271
|
-
data.columns = [self._validate_identifier(col).lower() for col in data.columns]
|
|
1272
|
-
data = data.replace({pd.NA: None}).to_dict('records')
|
|
1273
|
-
except Exception as e:
|
|
1274
|
-
logger.error('DataFrame处理时发生错误', {
|
|
1275
|
-
'error': str(e),
|
|
1276
|
-
'data': self._shorten_for_log(data),
|
|
1277
|
-
})
|
|
1278
|
-
raise ValueError(f"DataFrame处理时发生错误: {e}")
|
|
485
|
+
return data.to_dict('records')
|
|
1279
486
|
elif isinstance(data, dict):
|
|
1280
|
-
|
|
1281
|
-
data = [{k: v for k, v in data.items()}]
|
|
1282
|
-
else:
|
|
1283
|
-
data = [{k.lower(): v for k, v in data.items()}]
|
|
487
|
+
return [data]
|
|
1284
488
|
elif isinstance(data, list) and all(isinstance(item, dict) for item in data):
|
|
1285
|
-
|
|
1286
|
-
data = [{k: v for k, v in item.items()} for item in data]
|
|
1287
|
-
else:
|
|
1288
|
-
data = [{k.lower(): v for k, v in item.items()} for item in data]
|
|
1289
|
-
else:
|
|
1290
|
-
logger.error('数据结构必须是字典、列表、字典列表或dataframe', {
|
|
1291
|
-
'data': self._shorten_for_log(data),
|
|
1292
|
-
})
|
|
1293
|
-
raise ValueError("数据结构必须是字典、列表、字典列表或dataframe")
|
|
1294
|
-
|
|
1295
|
-
# 统一处理原始数据中列名的特殊字符
|
|
1296
|
-
data = self.normalize_column_names(data)
|
|
1297
|
-
|
|
1298
|
-
if not normalized_set_typ:
|
|
1299
|
-
logger.warning('set_typ为空, 将自动推断数据类型, 可能存在数据类型识别错误')
|
|
1300
|
-
|
|
1301
|
-
# 根据set_typ处理所有数据的列:严格按set_typ定义的列进行过滤
|
|
1302
|
-
filtered_set_typ = {}
|
|
1303
|
-
data_columns = list(data[0].keys()) if data and len(data) > 0 else []
|
|
1304
|
-
|
|
1305
|
-
if normalized_set_typ:
|
|
1306
|
-
# 严格按照set_typ定义的列进行过滤,排除id列
|
|
1307
|
-
for col in normalized_set_typ:
|
|
1308
|
-
if (self.case_sensitive and col == 'id') or (not self.case_sensitive and col.lower() == 'id'):
|
|
1309
|
-
continue
|
|
1310
|
-
filtered_set_typ[col] = normalized_set_typ[col]
|
|
1311
|
-
|
|
1312
|
-
# 对所有数据行进行列处理:补齐缺失列,丢弃多余列
|
|
1313
|
-
processed_data = []
|
|
1314
|
-
for row in data:
|
|
1315
|
-
processed_row = {}
|
|
1316
|
-
# 只保留set_typ中定义的列
|
|
1317
|
-
for col in filtered_set_typ:
|
|
1318
|
-
if col in row:
|
|
1319
|
-
processed_row[col] = row[col]
|
|
1320
|
-
else:
|
|
1321
|
-
processed_row[col] = None # 缺失列用None填充
|
|
1322
|
-
processed_data.append(processed_row)
|
|
1323
|
-
data = processed_data
|
|
1324
|
-
|
|
1325
|
-
# 检查是否有丢弃的列
|
|
1326
|
-
dropped_columns = [col for col in data_columns if col not in filtered_set_typ]
|
|
1327
|
-
if dropped_columns:
|
|
1328
|
-
logger.warning('数据中存在set_typ未定义的列并已被丢弃', {
|
|
1329
|
-
'库': db_name,
|
|
1330
|
-
'表': table_name,
|
|
1331
|
-
'丢弃列': dropped_columns,
|
|
1332
|
-
# '保留列': list(filtered_set_typ.keys())
|
|
1333
|
-
})
|
|
1334
|
-
|
|
1335
|
-
logger.debug('数据列处理完成', {
|
|
1336
|
-
'库': db_name,
|
|
1337
|
-
'表': table_name,
|
|
1338
|
-
'原始列': data_columns,
|
|
1339
|
-
'目标列': list(filtered_set_typ.keys()),
|
|
1340
|
-
'丢弃列': dropped_columns
|
|
1341
|
-
})
|
|
489
|
+
return data
|
|
1342
490
|
else:
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
for val in sample_values:
|
|
1350
|
-
inferred_type = self._infer_data_type(val, no_log=True)
|
|
1351
|
-
if inferred_type:
|
|
1352
|
-
break
|
|
1353
|
-
if not inferred_type:
|
|
1354
|
-
inferred_type = 'VARCHAR(255)'
|
|
1355
|
-
filtered_set_typ[col] = inferred_type
|
|
1356
|
-
logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
|
|
1357
|
-
|
|
491
|
+
raise ValueError("数据格式必须是字典、字典列表或DataFrame")
|
|
492
|
+
|
|
493
|
+
@staticmethod
|
|
494
|
+
def prepare_data_for_insert(data: List[Dict], set_typ: Dict[str, str],
|
|
495
|
+
allow_null: bool = False) -> List[Dict]:
|
|
496
|
+
"""准备插入数据"""
|
|
1358
497
|
prepared_data = []
|
|
498
|
+
|
|
1359
499
|
for row_idx, row in enumerate(data, 1):
|
|
1360
500
|
prepared_row = {}
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
# 对于自动时间戳字段,跳过处理,让MySQL自动处理
|
|
1366
|
-
col_type_lower = filtered_set_typ[col_name].lower()
|
|
1367
|
-
is_auto_timestamp = ('timestamp' in col_type_lower and 'current_timestamp' in col_type_lower and
|
|
1368
|
-
col_name in ['创建时间', '更新时间'])
|
|
1369
|
-
|
|
1370
|
-
if is_auto_timestamp:
|
|
1371
|
-
# 自动时间戳字段完全跳过,不在INSERT语句中包含
|
|
501
|
+
|
|
502
|
+
for col_name, col_type in set_typ.items():
|
|
503
|
+
# 跳过系统列(id, create_at, update_at由MySQL自动处理)
|
|
504
|
+
if col_name.lower() in ['id', 'create_at', 'update_at']:
|
|
1372
505
|
continue
|
|
1373
506
|
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`, 且不允许空值"
|
|
1388
|
-
logger.error(error_msg, {'row': self._shorten_for_log(row)})
|
|
1389
|
-
raise ValueError(error_msg)
|
|
1390
|
-
except Exception:
|
|
1391
|
-
error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`, 且不允许空值"
|
|
1392
|
-
logger.error(error_msg, {'row': self._shorten_for_log(row)})
|
|
1393
|
-
raise ValueError(error_msg)
|
|
1394
|
-
else:
|
|
1395
|
-
prepared_row[col_name] = None
|
|
1396
|
-
else:
|
|
1397
|
-
# 处理用户传入的值
|
|
1398
|
-
try:
|
|
1399
|
-
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
|
|
1400
|
-
except ValueError as e:
|
|
1401
|
-
# 如果数据验证失败,检查是否为空值且不允许空值,尝试使用兜底值
|
|
1402
|
-
original_value = row[col_name]
|
|
1403
|
-
is_empty_original = (original_value is None or
|
|
1404
|
-
original_value == '' or
|
|
1405
|
-
(not isinstance(original_value, (list, dict)) and
|
|
1406
|
-
pd.isna(original_value) if hasattr(pd, 'isna') else False))
|
|
1407
|
-
|
|
1408
|
-
if is_empty_original and not allow_null:
|
|
1409
|
-
try:
|
|
1410
|
-
fallback_value = self._get_fallback_value(filtered_set_typ[col_name].lower(), allow_null, db_name, table_name, col_name, original_value)
|
|
1411
|
-
if fallback_value is not None:
|
|
1412
|
-
prepared_row[col_name] = fallback_value
|
|
1413
|
-
logger.warning(f"行:{row_idx}, 列:`{col_name}` -> 原值验证失败,使用兜底值: {fallback_value}", {
|
|
1414
|
-
'原值': original_value,
|
|
1415
|
-
'兜底值': fallback_value,
|
|
1416
|
-
'row': self._shorten_for_log(row)
|
|
1417
|
-
})
|
|
1418
|
-
else:
|
|
1419
|
-
logger.error('数据验证失败', {
|
|
1420
|
-
'列': col_name,
|
|
1421
|
-
'行': row_idx,
|
|
1422
|
-
'报错': str(e),
|
|
1423
|
-
'row': self._shorten_for_log(row),
|
|
1424
|
-
})
|
|
1425
|
-
raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
|
|
1426
|
-
except Exception:
|
|
1427
|
-
logger.error('数据验证失败', {
|
|
1428
|
-
'列': col_name,
|
|
1429
|
-
'行': row_idx,
|
|
1430
|
-
'报错': str(e),
|
|
1431
|
-
'row': self._shorten_for_log(row),
|
|
1432
|
-
})
|
|
1433
|
-
raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
|
|
1434
|
-
else:
|
|
1435
|
-
logger.error('数据验证失败', {
|
|
1436
|
-
'列': col_name,
|
|
1437
|
-
'行': row_idx,
|
|
1438
|
-
'报错': str(e),
|
|
1439
|
-
'row': self._shorten_for_log(row),
|
|
1440
|
-
})
|
|
1441
|
-
raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
|
|
1442
|
-
prepared_data.append(prepared_row)
|
|
1443
|
-
return prepared_data, filtered_set_typ
|
|
1444
|
-
|
|
1445
|
-
def upload_data(
|
|
1446
|
-
self,
|
|
1447
|
-
db_name: str,
|
|
1448
|
-
table_name: str,
|
|
1449
|
-
data: Union[Dict, List[Dict], pd.DataFrame],
|
|
1450
|
-
set_typ: Dict[str, str],
|
|
1451
|
-
primary_keys: Optional[List[str]] = None,
|
|
1452
|
-
check_duplicate: bool = False,
|
|
1453
|
-
duplicate_columns: Optional[List[str]] = None,
|
|
1454
|
-
allow_null: bool = False,
|
|
1455
|
-
partition_by: Optional[str] = None,
|
|
1456
|
-
partition_date_column: str = '日期',
|
|
1457
|
-
auto_create: bool = True,
|
|
1458
|
-
indexes: Optional[List[str]] = None,
|
|
1459
|
-
update_on_duplicate: bool = False,
|
|
1460
|
-
transaction_mode: str = "batch",
|
|
1461
|
-
unique_keys: Optional[List[List[str]]] = None,
|
|
1462
|
-
auto_timestamps: bool = False
|
|
1463
|
-
):
|
|
1464
|
-
"""
|
|
1465
|
-
上传数据到数据库的主入口方法
|
|
1466
|
-
|
|
1467
|
-
:param db_name: 数据库名
|
|
1468
|
-
:param table_name: 表名
|
|
1469
|
-
:param data: 要上传的数据,支持字典、字典列表或DataFrame格式
|
|
1470
|
-
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
|
1471
|
-
:param primary_keys: 主键列列表,可选。格式:['col1', 'col2'] 或 None
|
|
1472
|
-
:param check_duplicate: 是否检查重复数据,默认为False
|
|
1473
|
-
:param duplicate_columns: 用于检查重复的列,可选。格式:['col1', 'col2'] 或 None
|
|
1474
|
-
:param allow_null: 是否允许空值,默认为False
|
|
1475
|
-
:param partition_by: 分表方式('year'、'month'、'None'),可选
|
|
1476
|
-
:param partition_date_column: 用于分表的日期列名,默认为'日期', 默认会添加为索引
|
|
1477
|
-
:param auto_create: 表不存在时是否自动创建,默认为True
|
|
1478
|
-
:param indexes: 需要创建索引的列列表,可选。格式:['col1', 'col2'] 或 None
|
|
1479
|
-
:param update_on_duplicate: 遇到重复数据时是否更新旧数据,默认为False
|
|
1480
|
-
:param transaction_mode: 事务模式,可选值:
|
|
1481
|
-
- 'row' : 逐行提交事务(错误隔离性好)
|
|
1482
|
-
- 'batch' : 整批提交事务(性能最优)
|
|
1483
|
-
- 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
|
|
1484
|
-
:param unique_keys: 唯一约束列表,每个元素为列名列表,支持多列组合唯一约束。格式:[['col1', 'col2'], ['col3']] 或 None
|
|
1485
|
-
:param auto_timestamps: 是否自动添加创建时间和更新时间列,默认为False。启用后会自动添加'创建时间'和'更新时间'两列
|
|
1486
|
-
:raises: 可能抛出各种验证和数据库相关异常
|
|
1487
|
-
|
|
1488
|
-
---
|
|
1489
|
-
参数格式验证:
|
|
1490
|
-
|
|
1491
|
-
- primary_keys: 必须是字符串列表或None,如 ['col1', 'col2']
|
|
1492
|
-
- indexes: 必须是字符串列表或None,如 ['col1', 'col2']
|
|
1493
|
-
- unique_keys: 必须是嵌套列表或None,如 [['col1', 'col2'], ['col3']]
|
|
1494
|
-
- 错误示例:unique_keys=['col1', 'col2'] (应该是 [['col1', 'col2']])
|
|
1495
|
-
- 所有列名不能为空字符串,会自动去除首尾空格
|
|
1496
|
-
- 重复的列名会被自动去重
|
|
1497
|
-
|
|
1498
|
-
空值处理规则:
|
|
1499
|
-
- None: 直接返回None,忽略此参数
|
|
1500
|
-
- []: 空列表,返回None,忽略此参数
|
|
1501
|
-
- [[]]: 包含空列表,跳过空列表,如果最终为空则返回None
|
|
1502
|
-
- ['']: 包含空字符串,抛出异常(不允许空字符串)
|
|
1503
|
-
- [' ']: 包含纯空白字符,抛出异常(不允许纯空白字符)
|
|
1504
|
-
- ['', 'col1']: 混合空字符串和有效字符串,跳过空字符串,保留有效字符串
|
|
1505
|
-
|
|
1506
|
-
---
|
|
1507
|
-
关于 indexes 和 unique_keys 参数:
|
|
1508
|
-
|
|
1509
|
-
- indexes 创建普通索引,unique_keys 创建唯一约束
|
|
1510
|
-
- 如果同一列同时出现在 indexes 和 unique_keys 中,系统会优先创建唯一约束,跳过普通索引
|
|
1511
|
-
- 唯一约束本身就具有索引功能,因此不会重复创建普通索引
|
|
1512
|
-
- 建议:如果某列需要唯一性约束,直接使用 unique_keys 参数,无需在 indexes 中重复指定
|
|
1513
|
-
|
|
1514
|
-
---
|
|
1515
|
-
unique_keys、check_duplicate、update_on_duplicate 三者组合下的行为总结:
|
|
1516
|
-
|
|
1517
|
-
| unique_keys | check_duplicate | update_on_duplicate | 行为 |
|
|
1518
|
-
|-------------|----------------|---------------------|------------------------------|
|
|
1519
|
-
| 有/无 | False | False | 冲突时报错/跳过,不覆盖 |
|
|
1520
|
-
| 有/无 | False | True | 冲突时覆盖(ON DUPLICATE KEY)|
|
|
1521
|
-
| 有/无 | True | False | 主动查重,冲突时跳过,不覆盖 |
|
|
1522
|
-
| 有/无 | True | True | 主动查重,冲突时覆盖 |
|
|
1523
|
-
|
|
1524
|
-
- unique_keys 只决定唯一性,不决定是否覆盖。
|
|
1525
|
-
- check_duplicate=True 时,插入前主动查重,重复数据跳过或覆盖,取决于 update_on_duplicate。
|
|
1526
|
-
- update_on_duplicate=True 时,遇到唯一约束冲突会用新数据覆盖旧数据。
|
|
1527
|
-
- 只要 update_on_duplicate=True 且表存在唯一约束(如 unique_keys),无论 check_duplicate 是否为 True,都会更新旧数据(即 ON DUPLICATE KEY UPDATE 生效)。
|
|
1528
|
-
- 如需"覆盖"行为,务必设置 update_on_duplicate=True,不管 check_duplicate 是否为 True。
|
|
1529
|
-
- 如需"跳过"行为,设置 update_on_duplicate=False 即可。
|
|
1530
|
-
|
|
1531
|
-
---
|
|
1532
|
-
auto_timestamps 参数:
|
|
1533
|
-
|
|
1534
|
-
- 当 auto_timestamps=True 时,系统会自动添加'创建时间'和'更新时间'两列
|
|
1535
|
-
- 如果原始数据中已存在这两列,系统会先移除原始数据中的这些列,然后添加新的时间戳
|
|
1536
|
-
- '创建时间':记录数据首次插入的时间,使用当前时间戳
|
|
1537
|
-
- '更新时间':记录数据最后更新的时间,插入时与创建时间相同,更新时会自动更新为当前时间
|
|
1538
|
-
- 时间戳列的数据类型为 DATETIME,格式为 'YYYY-MM-DD HH:MM:SS'
|
|
1539
|
-
- 这两列会自动添加到 set_typ 中,无需手动指定
|
|
1540
|
-
- 建议在需要审计数据变更历史的表中启用此功能
|
|
1541
|
-
"""
|
|
1542
|
-
# upload_start = time.time()
|
|
1543
|
-
# 检查data参数是否为None
|
|
1544
|
-
if data is None:
|
|
1545
|
-
logger.error('data参数不能为None', {
|
|
1546
|
-
'库': db_name,
|
|
1547
|
-
'表': table_name,
|
|
1548
|
-
})
|
|
1549
|
-
raise ValueError("data参数不能为None,请传入有效的数据")
|
|
1550
|
-
|
|
1551
|
-
if isinstance(data, list) or (hasattr(data, 'shape') and hasattr(data, '__len__')):
|
|
1552
|
-
initial_row_count = len(data)
|
|
1553
|
-
else:
|
|
1554
|
-
initial_row_count = 1
|
|
1555
|
-
|
|
1556
|
-
batch_id = f"batch_{int(time.time() * 1000)}"
|
|
1557
|
-
success_flag = False
|
|
1558
|
-
dropped_rows = 0
|
|
1559
|
-
total_inserted = 0
|
|
1560
|
-
total_skipped = 0
|
|
1561
|
-
total_failed = 0
|
|
1562
|
-
validated_primary_keys = None
|
|
1563
|
-
validated_indexes = None
|
|
1564
|
-
validated_unique_keys = None
|
|
1565
|
-
prepared_data = None
|
|
1566
|
-
filtered_set_typ = None
|
|
1567
|
-
inserted = None
|
|
1568
|
-
skipped = None
|
|
1569
|
-
failed = None
|
|
1570
|
-
|
|
1571
|
-
try:
|
|
1572
|
-
# 验证参数格式
|
|
1573
|
-
validated_primary_keys = self._validate_primary_keys_format(primary_keys, db_name, table_name)
|
|
1574
|
-
validated_indexes = self._validate_indexes_format(indexes, db_name, table_name)
|
|
1575
|
-
validated_unique_keys = self._validate_unique_keys_format(unique_keys, db_name, table_name)
|
|
507
|
+
value = row.get(col_name)
|
|
508
|
+
try:
|
|
509
|
+
prepared_row[col_name] = DataValidator.validate_and_convert_value(
|
|
510
|
+
value, col_type, allow_null
|
|
511
|
+
)
|
|
512
|
+
except ValueError as e:
|
|
513
|
+
logger.error('数据验证失败', {
|
|
514
|
+
'行号': row_idx,
|
|
515
|
+
'列名': col_name,
|
|
516
|
+
'原始值': value,
|
|
517
|
+
'错误': str(e)
|
|
518
|
+
})
|
|
519
|
+
raise ValueError(f"行{row_idx}列{col_name}验证失败: {str(e)}")
|
|
1576
520
|
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
'事务模式': transaction_mode,
|
|
1593
|
-
'唯一约束': validated_unique_keys
|
|
1594
|
-
},
|
|
1595
|
-
# '数据样例': self._shorten_for_log(data, 2)
|
|
1596
|
-
})
|
|
521
|
+
prepared_data.append(prepared_row)
|
|
522
|
+
|
|
523
|
+
return prepared_data
|
|
524
|
+
|
|
525
|
+
@staticmethod
|
|
526
|
+
def partition_data_by_date(data: List[Dict], date_column: str,
|
|
527
|
+
partition_by: str) -> Dict[str, List[Dict]]:
|
|
528
|
+
"""按日期分区数据"""
|
|
529
|
+
partitioned = {}
|
|
530
|
+
table_manager = TableManager(None, None) # 只用静态方法
|
|
531
|
+
|
|
532
|
+
for row in data:
|
|
533
|
+
if date_column not in row:
|
|
534
|
+
logger.warning('缺少分区日期列', {'列名': date_column, '行数据': row})
|
|
535
|
+
continue
|
|
1597
536
|
|
|
1598
|
-
# 验证分表参数
|
|
1599
|
-
if partition_by:
|
|
1600
|
-
partition_by = str(partition_by).lower()
|
|
1601
|
-
if partition_by not in ['year', 'month']:
|
|
1602
|
-
logger.error('分表方式必须是 "year" 或 "month" 或 "None', {
|
|
1603
|
-
'库': db_name,
|
|
1604
|
-
'表': table_name,
|
|
1605
|
-
'批次': batch_id,
|
|
1606
|
-
'分表方式': partition_by,
|
|
1607
|
-
})
|
|
1608
|
-
raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
|
|
1609
|
-
|
|
1610
|
-
# 准备数据
|
|
1611
|
-
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null, db_name, table_name, auto_timestamps)
|
|
1612
|
-
|
|
1613
|
-
# 检查数据库是否存在
|
|
1614
|
-
if not self._check_database_exists(db_name):
|
|
1615
|
-
if auto_create:
|
|
1616
|
-
self._create_database(db_name)
|
|
1617
|
-
else:
|
|
1618
|
-
logger.error('数据库不存在', {
|
|
1619
|
-
'库': db_name,
|
|
1620
|
-
})
|
|
1621
|
-
raise ValueError(f"数据库不存在: `{db_name}`")
|
|
1622
|
-
|
|
1623
|
-
# 处理分表逻辑
|
|
1624
|
-
if partition_by:
|
|
1625
|
-
partitioned_data = {}
|
|
1626
|
-
for row in prepared_data:
|
|
1627
|
-
try:
|
|
1628
|
-
if partition_date_column not in row:
|
|
1629
|
-
logger.error('异常缺失列',{
|
|
1630
|
-
'库': db_name,
|
|
1631
|
-
'表': table_name,
|
|
1632
|
-
'批次': batch_id,
|
|
1633
|
-
'缺失列': partition_date_column,
|
|
1634
|
-
'row': self._shorten_for_log(row),
|
|
1635
|
-
})
|
|
1636
|
-
dropped_rows += 1
|
|
1637
|
-
continue
|
|
1638
|
-
part_table = self._get_partition_table_name(
|
|
1639
|
-
table_name,
|
|
1640
|
-
str(row[partition_date_column]),
|
|
1641
|
-
partition_by
|
|
1642
|
-
)
|
|
1643
|
-
if part_table not in partitioned_data:
|
|
1644
|
-
partitioned_data[part_table] = []
|
|
1645
|
-
partitioned_data[part_table].append(row)
|
|
1646
|
-
except Exception as e:
|
|
1647
|
-
logger.error('分表处理异常', {
|
|
1648
|
-
'库': db_name,
|
|
1649
|
-
'表': table_name,
|
|
1650
|
-
'row_data': self._shorten_for_log(row),
|
|
1651
|
-
'error': str(e),
|
|
1652
|
-
})
|
|
1653
|
-
dropped_rows += 1
|
|
1654
|
-
continue
|
|
1655
|
-
|
|
1656
|
-
# 对每个分表执行上传
|
|
1657
|
-
total_inserted = 0
|
|
1658
|
-
total_skipped = dropped_rows # 分表异常丢弃
|
|
1659
|
-
total_failed = 0
|
|
1660
|
-
for part_table, part_data in partitioned_data.items():
|
|
1661
|
-
try:
|
|
1662
|
-
inserted, skipped, failed = self._upload_to_table(
|
|
1663
|
-
db_name, part_table, part_data, filtered_set_typ,
|
|
1664
|
-
validated_primary_keys, check_duplicate, duplicate_columns,
|
|
1665
|
-
allow_null, auto_create, partition_date_column,
|
|
1666
|
-
validated_indexes, batch_id, update_on_duplicate, transaction_mode,
|
|
1667
|
-
validated_unique_keys
|
|
1668
|
-
)
|
|
1669
|
-
total_inserted += inserted
|
|
1670
|
-
total_skipped += skipped
|
|
1671
|
-
total_failed += failed
|
|
1672
|
-
if partition_date_column in filtered_set_typ:
|
|
1673
|
-
try:
|
|
1674
|
-
self._ensure_index(db_name, part_table, partition_date_column)
|
|
1675
|
-
except Exception as e:
|
|
1676
|
-
logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': part_table, '列': partition_date_column, '错误': str(e)})
|
|
1677
|
-
except Exception as e:
|
|
1678
|
-
logger.error('分表上传异常', {
|
|
1679
|
-
'库': db_name,
|
|
1680
|
-
'表': table_name,
|
|
1681
|
-
'分表': part_table,
|
|
1682
|
-
'error': str(e),
|
|
1683
|
-
'数据样例': self._shorten_for_log(part_data, 2),
|
|
1684
|
-
})
|
|
1685
|
-
continue # 跳过当前分表,继续处理其他分表
|
|
1686
|
-
else:
|
|
1687
|
-
# 不分表,直接上传
|
|
1688
|
-
inserted, skipped, failed = self._upload_to_table(
|
|
1689
|
-
db_name, table_name, prepared_data, filtered_set_typ,
|
|
1690
|
-
validated_primary_keys, check_duplicate, duplicate_columns,
|
|
1691
|
-
allow_null, auto_create, partition_date_column,
|
|
1692
|
-
validated_indexes, batch_id, update_on_duplicate, transaction_mode,
|
|
1693
|
-
validated_unique_keys
|
|
1694
|
-
)
|
|
1695
|
-
total_inserted = inserted
|
|
1696
|
-
total_skipped = skipped
|
|
1697
|
-
total_failed = failed
|
|
1698
|
-
if partition_date_column in filtered_set_typ:
|
|
1699
|
-
try:
|
|
1700
|
-
self._ensure_index(db_name, table_name, partition_date_column)
|
|
1701
|
-
except Exception as e:
|
|
1702
|
-
logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': partition_date_column, '错误': str(e)})
|
|
1703
|
-
|
|
1704
|
-
success_flag = True
|
|
1705
|
-
|
|
1706
|
-
except Exception as e:
|
|
1707
|
-
logger.error('上传过程发生全局错误', {
|
|
1708
|
-
'库': db_name,
|
|
1709
|
-
'表': table_name,
|
|
1710
|
-
'error': str(e),
|
|
1711
|
-
'error_type': type(e).__name__,
|
|
1712
|
-
'数据样例': self._shorten_for_log(data, 2),
|
|
1713
|
-
})
|
|
1714
|
-
return False
|
|
1715
|
-
finally:
|
|
1716
|
-
logger.info("存储完成", {
|
|
1717
|
-
'库': db_name,
|
|
1718
|
-
'表': table_name,
|
|
1719
|
-
'批次': batch_id,
|
|
1720
|
-
'finish': success_flag,
|
|
1721
|
-
'数据行': initial_row_count,
|
|
1722
|
-
'插入': total_inserted,
|
|
1723
|
-
'跳过': total_skipped,
|
|
1724
|
-
'失败': total_failed
|
|
1725
|
-
})
|
|
1726
|
-
|
|
1727
|
-
# 更新索引(只有在成功时才执行)
|
|
1728
|
-
if success_flag and validated_indexes:
|
|
1729
537
|
try:
|
|
1730
|
-
|
|
538
|
+
partition_suffix = table_manager.get_partition_table_name(
|
|
539
|
+
'', row[date_column], partition_by
|
|
540
|
+
).split('_', 1)[1] # 获取后缀部分
|
|
541
|
+
|
|
542
|
+
if partition_suffix not in partitioned:
|
|
543
|
+
partitioned[partition_suffix] = []
|
|
544
|
+
partitioned[partition_suffix].append(row)
|
|
1731
545
|
except Exception as e:
|
|
1732
|
-
logger.
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
})
|
|
1737
|
-
return True
|
|
546
|
+
logger.error('分区处理失败', {'行数据': row, '错误': str(e)})
|
|
547
|
+
continue
|
|
548
|
+
|
|
549
|
+
return partitioned
|
|
1738
550
|
|
|
1739
|
-
@_execute_with_retry
|
|
1740
|
-
def _insert_data(
|
|
1741
|
-
self,
|
|
1742
|
-
db_name: str,
|
|
1743
|
-
table_name: str,
|
|
1744
|
-
data: List[Dict],
|
|
1745
|
-
set_typ: Dict[str, str],
|
|
1746
|
-
check_duplicate: bool,
|
|
1747
|
-
duplicate_columns: Optional[List[str]],
|
|
1748
|
-
batch_id: Optional[str] = None,
|
|
1749
|
-
update_on_duplicate: bool = False,
|
|
1750
|
-
transaction_mode: str = "batch"
|
|
1751
|
-
):
|
|
1752
|
-
"""
|
|
1753
|
-
实际执行数据插入的方法
|
|
1754
551
|
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
- 'row' : 逐行提交事务(错误隔离性好)
|
|
1765
|
-
- 'batch' : 整批提交事务(性能最优)
|
|
1766
|
-
- 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
|
|
1767
|
-
"""
|
|
552
|
+
class DataInserter:
|
|
553
|
+
"""数据插入器"""
|
|
554
|
+
|
|
555
|
+
def __init__(self, connection_manager: DatabaseConnectionManager):
|
|
556
|
+
self.conn_mgr = connection_manager
|
|
557
|
+
|
|
558
|
+
def insert_data(self, db_name: str, table_name: str, data: List[Dict],
|
|
559
|
+
set_typ: Dict[str, str], update_on_duplicate: bool = False) -> Tuple[int, int, int]:
|
|
560
|
+
"""插入数据"""
|
|
1768
561
|
if not data:
|
|
1769
562
|
return 0, 0, 0
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
total_inserted, total_skipped, total_failed = self._execute_batch_insert(
|
|
1777
|
-
db_name, table_name, data, set_typ,
|
|
1778
|
-
sql, check_duplicate, duplicate_columns,
|
|
1779
|
-
batch_id, transaction_mode,
|
|
1780
|
-
update_on_duplicate
|
|
1781
|
-
)
|
|
1782
|
-
logger.debug('插入完成', {
|
|
1783
|
-
'库': db_name,
|
|
1784
|
-
'表': table_name,
|
|
1785
|
-
'总计': len(data),
|
|
1786
|
-
'插入': total_inserted,
|
|
1787
|
-
'跳过': total_skipped,
|
|
1788
|
-
'失败': total_failed,
|
|
1789
|
-
'事务模式': transaction_mode,
|
|
1790
|
-
})
|
|
1791
|
-
return total_inserted, total_skipped, total_failed
|
|
1792
|
-
|
|
1793
|
-
def _validate_transaction_mode(self, mode: str) -> str:
|
|
1794
|
-
"""验证并标准化事务模式"""
|
|
1795
|
-
valid_modes = ('row', 'batch', 'hybrid')
|
|
1796
|
-
if mode.lower() not in valid_modes:
|
|
1797
|
-
logger.error('事务模式参数错误', {
|
|
1798
|
-
'错误值': mode,
|
|
1799
|
-
'可选值': valid_modes,
|
|
1800
|
-
'自动使用默认模式': 'batch',
|
|
1801
|
-
})
|
|
1802
|
-
return 'batch'
|
|
1803
|
-
return mode.lower()
|
|
1804
|
-
|
|
1805
|
-
def _build_simple_insert_sql(self, db_name, table_name, columns, update_on_duplicate):
|
|
1806
|
-
safe_columns = [self._validate_identifier(col) for col in columns]
|
|
1807
|
-
placeholders = ','.join(['%s'] * len(safe_columns))
|
|
1808
|
-
|
|
563
|
+
|
|
564
|
+
# 准备SQL语句(排除系统列)
|
|
565
|
+
columns = [col for col in set_typ.keys() if col.lower() not in ['id', 'create_at', 'update_at']]
|
|
566
|
+
safe_columns = [TableManager._sanitize_identifier(col) for col in columns]
|
|
567
|
+
placeholders = ','.join(['%s'] * len(columns))
|
|
568
|
+
|
|
1809
569
|
sql = f"""
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
570
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
|
571
|
+
(`{'`,`'.join(safe_columns)}`)
|
|
572
|
+
VALUES ({placeholders})
|
|
1813
573
|
"""
|
|
1814
|
-
|
|
1815
|
-
# 情况2:不检查重复但允许更新
|
|
574
|
+
|
|
1816
575
|
if update_on_duplicate:
|
|
1817
|
-
|
|
1818
|
-
|
|
576
|
+
# 更新时只更新业务列,不更新create_at,update_at会自动更新
|
|
577
|
+
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in safe_columns])
|
|
1819
578
|
sql += f" ON DUPLICATE KEY UPDATE {update_clause}"
|
|
1820
|
-
|
|
1821
|
-
return sql
|
|
1822
|
-
|
|
1823
|
-
def _build_duplicate_check_sql(self, db_name, table_name, all_columns,
|
|
1824
|
-
duplicate_columns, update_on_duplicate, set_typ):
|
|
1825
|
-
if duplicate_columns is None:
|
|
1826
|
-
duplicate_columns = []
|
|
1827
|
-
duplicate_columns = [_item for _item in duplicate_columns if _item.lower() not in self.base_excute_col]
|
|
1828
|
-
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
|
1829
|
-
placeholders = ','.join(['%s'] * len(safe_columns))
|
|
1830
|
-
|
|
1831
|
-
# 确定排重列(排除id和更新时间列)
|
|
1832
|
-
dup_cols = duplicate_columns if duplicate_columns else all_columns
|
|
1833
|
-
|
|
1834
|
-
# 构建排重条件
|
|
1835
|
-
conditions = []
|
|
1836
|
-
for col in dup_cols:
|
|
1837
|
-
col_type = set_typ.get(col, '').lower()
|
|
1838
|
-
if col_type.startswith('decimal'):
|
|
1839
|
-
_, scale = self._get_decimal_scale(col_type)
|
|
1840
|
-
conditions.append(f"ROUND(`{col}`, {scale}) = ROUND(%s, {scale})")
|
|
1841
|
-
else:
|
|
1842
|
-
conditions.append(f"`{col}` = %s")
|
|
1843
|
-
|
|
1844
|
-
# 情况3/5:允许更新
|
|
1845
|
-
if update_on_duplicate:
|
|
1846
|
-
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
|
|
1847
|
-
for col in all_columns])
|
|
1848
|
-
sql = f"""
|
|
1849
|
-
INSERT INTO `{db_name}`.`{table_name}`
|
|
1850
|
-
(`{'`,`'.join(safe_columns)}`)
|
|
1851
|
-
VALUES ({placeholders})
|
|
1852
|
-
ON DUPLICATE KEY UPDATE {update_clause}
|
|
1853
|
-
"""
|
|
1854
|
-
else:
|
|
1855
|
-
# 情况4/6:不允许更新
|
|
1856
|
-
sql = f"""
|
|
1857
|
-
INSERT INTO `{db_name}`.`{table_name}`
|
|
1858
|
-
(`{'`,`'.join(safe_columns)}`)
|
|
1859
|
-
SELECT {placeholders}
|
|
1860
|
-
FROM DUAL
|
|
1861
|
-
WHERE NOT EXISTS (
|
|
1862
|
-
SELECT 1 FROM `{db_name}`.`{table_name}`
|
|
1863
|
-
WHERE {' AND '.join(conditions)}
|
|
1864
|
-
)
|
|
1865
|
-
"""
|
|
1866
|
-
return sql
|
|
1867
|
-
|
|
1868
|
-
def _get_decimal_scale(self, decimal_type: str) -> Tuple[int, int]:
|
|
1869
|
-
"""从DECIMAL类型字符串中提取精度和标度"""
|
|
1870
|
-
match = re.search(r'\((\d+)\s*,\s*(\d+)\)', decimal_type)
|
|
1871
|
-
if match:
|
|
1872
|
-
return int(match.group(1)), int(match.group(2))
|
|
1873
|
-
return 18, 2 # 默认值
|
|
1874
|
-
|
|
1875
|
-
def _prepare_insert_sql(
|
|
1876
|
-
self,
|
|
1877
|
-
db_name: str,
|
|
1878
|
-
table_name: str,
|
|
1879
|
-
set_typ: Dict[str, str],
|
|
1880
|
-
check_duplicate: bool,
|
|
1881
|
-
duplicate_columns: Optional[List[str]],
|
|
1882
|
-
update_on_duplicate: bool
|
|
1883
|
-
) -> str:
|
|
1884
|
-
"""
|
|
1885
|
-
准备插入SQL语句, 增加StatementCache缓存
|
|
1886
|
-
"""
|
|
1887
|
-
cache_key = (db_name, table_name, tuple(sorted(set_typ.items())), check_duplicate, tuple(duplicate_columns) if duplicate_columns else (), update_on_duplicate)
|
|
1888
|
-
cached = self._prepared_statements.get(cache_key)
|
|
1889
|
-
if cached:
|
|
1890
|
-
return cached
|
|
1891
|
-
# 获取所有列名(排除id和自动时间戳字段)
|
|
1892
|
-
all_columns = []
|
|
1893
|
-
for col in set_typ.keys():
|
|
1894
|
-
if col.lower() == 'id':
|
|
1895
|
-
continue
|
|
1896
|
-
# 检查是否是自动时间戳字段
|
|
1897
|
-
col_type_lower = set_typ[col].lower()
|
|
1898
|
-
is_auto_timestamp = ('timestamp' in col_type_lower and 'current_timestamp' in col_type_lower and
|
|
1899
|
-
col in ['创建时间', '更新时间'])
|
|
1900
|
-
if not is_auto_timestamp:
|
|
1901
|
-
all_columns.append(col)
|
|
1902
|
-
if not check_duplicate:
|
|
1903
|
-
sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
|
|
1904
|
-
update_on_duplicate)
|
|
1905
|
-
else:
|
|
1906
|
-
dup_cols = duplicate_columns if duplicate_columns else [
|
|
1907
|
-
col for col in all_columns
|
|
1908
|
-
if col.lower() not in self.base_excute_col
|
|
1909
|
-
]
|
|
1910
|
-
sql = self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
|
1911
|
-
dup_cols, update_on_duplicate, set_typ)
|
|
1912
|
-
self._prepared_statements[cache_key] = sql
|
|
1913
|
-
return sql
|
|
1914
|
-
|
|
1915
|
-
def _execute_batch_insert(
|
|
1916
|
-
self,
|
|
1917
|
-
db_name: str,
|
|
1918
|
-
table_name: str,
|
|
1919
|
-
data: List[Dict],
|
|
1920
|
-
set_typ: Dict[str, str],
|
|
1921
|
-
sql: str,
|
|
1922
|
-
check_duplicate: bool,
|
|
1923
|
-
duplicate_columns: Optional[List[str]],
|
|
1924
|
-
batch_id: Optional[str],
|
|
1925
|
-
transaction_mode: str,
|
|
1926
|
-
update_on_duplicate: bool = False
|
|
1927
|
-
) -> Tuple[int, int, int]:
|
|
1928
|
-
"""
|
|
1929
|
-
执行批量插入操作,优化batch和hybrid模式。
|
|
1930
|
-
|
|
1931
|
-
- batch模式下,使用executemany批量插入(如SQL带ON DUPLICATE KEY UPDATE时),MySQL会对每一行单独判断唯一约束:
|
|
1932
|
-
- 不冲突的行会被正常插入。
|
|
1933
|
-
- 冲突的行会触发ON DUPLICATE KEY UPDATE,用新数据更新旧数据。
|
|
1934
|
-
- 不会因为一行冲突导致整批失败或回滚。
|
|
1935
|
-
- 只有遇到严重的数据库错误(如所有行都因唯一约束冲突且没有ON DUPLICATE KEY UPDATE),才会整体回滚。
|
|
1936
|
-
- 返回值为(插入行数, 跳过行数, 失败行数)。
|
|
1937
|
-
"""
|
|
1938
|
-
def get_optimal_batch_size(total_rows: int) -> int:
|
|
1939
|
-
if total_rows <= 100:
|
|
1940
|
-
return total_rows
|
|
1941
|
-
elif total_rows <= 1000:
|
|
1942
|
-
return 500
|
|
1943
|
-
elif total_rows <= 10000:
|
|
1944
|
-
return 1000
|
|
1945
|
-
else:
|
|
1946
|
-
return 2000
|
|
1947
|
-
|
|
1948
|
-
def ensure_basic_type(value):
|
|
1949
|
-
"""确保值是基本数据类型,如果是字典或列表则转换为字符串"""
|
|
1950
|
-
if isinstance(value, (dict, list)):
|
|
1951
|
-
try:
|
|
1952
|
-
return json.dumps(value, ensure_ascii=False)
|
|
1953
|
-
except (TypeError, ValueError):
|
|
1954
|
-
return str(value)
|
|
1955
|
-
return value
|
|
1956
579
|
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
# 检查是否是自动时间戳字段
|
|
1965
|
-
col_type_lower = set_typ[col].lower()
|
|
1966
|
-
is_auto_timestamp = ('timestamp' in col_type_lower and 'current_timestamp' in col_type_lower and
|
|
1967
|
-
col in ['创建时间', '更新时间'])
|
|
1968
|
-
if not is_auto_timestamp:
|
|
1969
|
-
all_columns.append(col)
|
|
580
|
+
# 批量插入
|
|
581
|
+
return self._execute_batch_insert(sql, data, columns)
|
|
582
|
+
|
|
583
|
+
def _execute_batch_insert(self, sql: str, data: List[Dict],
|
|
584
|
+
columns: List[str]) -> Tuple[int, int, int]:
|
|
585
|
+
"""执行批量插入"""
|
|
586
|
+
batch_size = min(1000, len(data))
|
|
1970
587
|
total_inserted = 0
|
|
1971
588
|
total_skipped = 0
|
|
1972
589
|
total_failed = 0
|
|
1973
|
-
|
|
590
|
+
|
|
591
|
+
with self.conn_mgr.get_connection() as conn:
|
|
1974
592
|
with conn.cursor() as cursor:
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
values_list.append(values)
|
|
1986
|
-
try:
|
|
1987
|
-
cursor.executemany(sql, values_list)
|
|
1988
|
-
conn.commit()
|
|
1989
|
-
# 在batch模式下,affected_rows表示实际影响的行数
|
|
1990
|
-
# 如果update_on_duplicate为True,则affected_rows包含更新的行数
|
|
1991
|
-
# 如果update_on_duplicate为False,则affected_rows只包含插入的行数
|
|
1992
|
-
affected = cursor.rowcount if cursor.rowcount is not None else 0
|
|
1993
|
-
if update_on_duplicate:
|
|
1994
|
-
# 当启用更新时,affected_rows包含插入和更新的行数
|
|
1995
|
-
# 我们需要区分插入和更新的行数
|
|
1996
|
-
# 由于无法准确区分,我们假设所有行都是插入的
|
|
1997
|
-
total_inserted += len(batch)
|
|
1998
|
-
else:
|
|
1999
|
-
# 当不启用更新时,affected_rows只包含插入的行数
|
|
2000
|
-
total_inserted += affected
|
|
2001
|
-
total_skipped += len(batch) - affected
|
|
2002
|
-
except pymysql.err.IntegrityError as e:
|
|
2003
|
-
conn.rollback()
|
|
2004
|
-
# 在唯一约束冲突时,所有行都被跳过
|
|
2005
|
-
total_skipped += len(batch)
|
|
2006
|
-
logger.debug('批量插入唯一约束冲突,全部跳过', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2007
|
-
except Exception as e:
|
|
2008
|
-
conn.rollback()
|
|
2009
|
-
total_failed += len(batch)
|
|
2010
|
-
logger.error('批量插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2011
|
-
elif transaction_mode == 'hybrid':
|
|
2012
|
-
hybrid_n = 100 # 可配置
|
|
2013
|
-
for i in range(0, len(data), hybrid_n):
|
|
2014
|
-
batch = data[i:i + hybrid_n]
|
|
2015
|
-
for row in batch:
|
|
2016
|
-
try:
|
|
2017
|
-
values = [ensure_basic_type(row.get(col)) for col in all_columns]
|
|
2018
|
-
if check_duplicate and not update_on_duplicate:
|
|
2019
|
-
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
|
2020
|
-
values += [ensure_basic_type(row.get(col)) for col in dup_cols]
|
|
2021
|
-
cursor.execute(sql, values)
|
|
2022
|
-
affected = cursor.rowcount if cursor.rowcount is not None else 0
|
|
2023
|
-
if update_on_duplicate:
|
|
2024
|
-
# 当启用更新时,affected_rows包含插入和更新的行数
|
|
2025
|
-
# 假设所有行都是插入的,因为无法区分插入和更新
|
|
2026
|
-
total_inserted += 1
|
|
2027
|
-
else:
|
|
2028
|
-
# 当不启用更新时,affected_rows只包含插入的行数
|
|
2029
|
-
if affected > 0:
|
|
2030
|
-
total_inserted += 1
|
|
2031
|
-
else:
|
|
2032
|
-
total_skipped += 1
|
|
2033
|
-
except pymysql.err.IntegrityError as e:
|
|
2034
|
-
conn.rollback()
|
|
2035
|
-
total_skipped += 1
|
|
2036
|
-
logger.debug('hybrid单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2037
|
-
except Exception as e:
|
|
2038
|
-
conn.rollback()
|
|
2039
|
-
total_failed += 1
|
|
2040
|
-
logger.error('hybrid单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
593
|
+
for i in range(0, len(data), batch_size):
|
|
594
|
+
batch = data[i:i + batch_size]
|
|
595
|
+
values_list = []
|
|
596
|
+
|
|
597
|
+
for row in batch:
|
|
598
|
+
values = [self._ensure_basic_type(row.get(col)) for col in columns]
|
|
599
|
+
values_list.append(values)
|
|
600
|
+
|
|
601
|
+
try:
|
|
602
|
+
cursor.executemany(sql, values_list)
|
|
2041
603
|
conn.commit()
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2053
|
-
# 假设所有行都是插入的,因为无法区分插入和更新
|
|
2054
|
-
total_inserted += 1
|
|
2055
|
-
else:
|
|
2056
|
-
# 当不启用更新时,affected_rows只包含插入的行数
|
|
2057
|
-
if affected > 0:
|
|
2058
|
-
total_inserted += 1
|
|
2059
|
-
else:
|
|
2060
|
-
total_skipped += 1
|
|
2061
|
-
conn.commit()
|
|
2062
|
-
except pymysql.err.IntegrityError as e:
|
|
2063
|
-
conn.rollback()
|
|
2064
|
-
total_skipped += 1
|
|
2065
|
-
logger.debug('单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2066
|
-
except Exception as e:
|
|
2067
|
-
conn.rollback()
|
|
2068
|
-
total_failed += 1
|
|
2069
|
-
logger.error('单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
604
|
+
affected = cursor.rowcount if cursor.rowcount is not None else len(batch)
|
|
605
|
+
total_inserted += affected
|
|
606
|
+
except pymysql.err.IntegrityError:
|
|
607
|
+
conn.rollback()
|
|
608
|
+
total_skipped += len(batch)
|
|
609
|
+
logger.debug('批量插入唯一约束冲突,跳过', {'批次大小': len(batch)})
|
|
610
|
+
except Exception as e:
|
|
611
|
+
conn.rollback()
|
|
612
|
+
total_failed += len(batch)
|
|
613
|
+
logger.error('批量插入失败', {'错误': str(e), '批次大小': len(batch)})
|
|
614
|
+
|
|
2070
615
|
return total_inserted, total_skipped, total_failed
|
|
2071
|
-
|
|
2072
|
-
def _check_pool_health(self) -> bool:
|
|
2073
|
-
"""
|
|
2074
|
-
检查连接池健康状态,防止连接泄露
|
|
2075
|
-
"""
|
|
2076
|
-
conn = None
|
|
2077
|
-
try:
|
|
2078
|
-
if not hasattr(self, 'pool') or self.pool is None:
|
|
2079
|
-
return False
|
|
2080
|
-
conn = self.pool.connection()
|
|
2081
|
-
conn.ping(reconnect=True)
|
|
2082
|
-
logger.debug('连接池健康检查通过')
|
|
2083
|
-
return True
|
|
2084
|
-
except Exception as e:
|
|
2085
|
-
logger.warning('连接池健康检查失败', {'error': str(e)})
|
|
2086
|
-
return False
|
|
2087
|
-
finally:
|
|
2088
|
-
if conn is not None:
|
|
2089
|
-
try:
|
|
2090
|
-
conn.close()
|
|
2091
|
-
except Exception as e:
|
|
2092
|
-
logger.warning('关闭连接时出错', {'error': str(e)})
|
|
2093
|
-
|
|
616
|
+
|
|
2094
617
|
@staticmethod
|
|
2095
|
-
def
|
|
2096
|
-
"""
|
|
2097
|
-
|
|
2098
|
-
:param max_retries: 最大重试次数
|
|
2099
|
-
:param delay: 重试间隔(秒)
|
|
2100
|
-
:return: 装饰器
|
|
2101
|
-
"""
|
|
2102
|
-
def decorator(func):
|
|
2103
|
-
@wraps(func)
|
|
2104
|
-
def wrapper(*args, **kwargs):
|
|
2105
|
-
last_exception = None
|
|
2106
|
-
for attempt in range(max_retries):
|
|
2107
|
-
try:
|
|
2108
|
-
return func(*args, **kwargs)
|
|
2109
|
-
except (pymysql.OperationalError, pymysql.InterfaceError) as e:
|
|
2110
|
-
last_exception = e
|
|
2111
|
-
logger.warning('操作失败,准备重试', {'attempt': attempt + 1, 'error': str(e)})
|
|
2112
|
-
if attempt < max_retries - 1:
|
|
2113
|
-
time.sleep(delay * (attempt + 1))
|
|
2114
|
-
continue
|
|
2115
|
-
logger.error(f'操作重试 {max_retries} 次后失败', {'error': str(e)})
|
|
2116
|
-
raise
|
|
2117
|
-
except Exception as e:
|
|
2118
|
-
logger.error('操作失败', {'error': str(e)})
|
|
2119
|
-
raise
|
|
2120
|
-
raise last_exception if last_exception else logger.error('操作重试失败,未知错误')
|
|
2121
|
-
return wrapper
|
|
2122
|
-
return decorator
|
|
2123
|
-
|
|
2124
|
-
def _shorten_for_log(self, obj: Any, maxlen: int = 200) -> Any:
|
|
2125
|
-
"""
|
|
2126
|
-
日志安全截断工具:对字符串、列表、字典等做长度限制,避免日志过长。
|
|
2127
|
-
:param obj: 原始对象
|
|
2128
|
-
:param maxlen: 最大长度/元素数
|
|
2129
|
-
:return: 截断后的对象
|
|
2130
|
-
"""
|
|
2131
|
-
if isinstance(obj, str):
|
|
2132
|
-
return obj[:maxlen] + ("..." if len(obj) > maxlen else "")
|
|
2133
|
-
elif isinstance(obj, list):
|
|
2134
|
-
return obj[:maxlen] + (["..."] if len(obj) > maxlen else [])
|
|
2135
|
-
elif isinstance(obj, dict):
|
|
2136
|
-
short = {k: self._shorten_for_log(v, maxlen) for i, (k, v) in enumerate(obj.items()) if i < maxlen}
|
|
2137
|
-
if len(obj) > maxlen:
|
|
2138
|
-
short['...'] = f"total_keys={len(obj)}"
|
|
2139
|
-
return short
|
|
2140
|
-
elif hasattr(obj, 'shape') and hasattr(obj, 'head'):
|
|
2141
|
-
# pandas DataFrame
|
|
2142
|
-
return f"DataFrame shape={obj.shape}, head={obj.head(1).to_dict()}"
|
|
2143
|
-
return obj
|
|
2144
|
-
|
|
2145
|
-
def _normalize_col(self, col: str) -> str:
|
|
2146
|
-
"""
|
|
2147
|
-
列名自动清洗并转小写(如case_sensitive为False),保证和表结构一致。
|
|
2148
|
-
"""
|
|
2149
|
-
safe = self._validate_identifier(col)
|
|
2150
|
-
return safe if self.case_sensitive else safe.lower()
|
|
2151
|
-
|
|
2152
|
-
def _update_indexes(self, db_name: str, table_name: str, indexes: Optional[List[str]]):
|
|
2153
|
-
"""
|
|
2154
|
-
更新索引,避免重复添加或更新,同时注意大小写一致性。
|
|
2155
|
-
注意:如果列已经在unique_keys中定义,则不会重复创建普通索引。
|
|
2156
|
-
|
|
2157
|
-
:param db_name: 数据库名
|
|
2158
|
-
:param table_name: 表名
|
|
2159
|
-
:param indexes: 需要更新的索引列列表
|
|
2160
|
-
"""
|
|
2161
|
-
if not indexes:
|
|
2162
|
-
return
|
|
2163
|
-
|
|
2164
|
-
# 规范化索引列名
|
|
2165
|
-
normalized_indexes = [self._normalize_col(idx) for idx in indexes]
|
|
2166
|
-
|
|
2167
|
-
# 获取现有索引(包括普通索引和唯一约束)
|
|
2168
|
-
try:
|
|
2169
|
-
existing_indexes = self._get_existing_indexes(db_name, table_name)
|
|
2170
|
-
except Exception as e:
|
|
2171
|
-
logger.error('获取现有索引时发生错误', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2172
|
-
raise
|
|
2173
|
-
|
|
2174
|
-
# 获取表中现有的列名
|
|
2175
|
-
try:
|
|
2176
|
-
existing_columns = self._get_table_columns(db_name, table_name)
|
|
2177
|
-
except Exception as e:
|
|
2178
|
-
logger.error('获取现有列时发生错误', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2179
|
-
raise
|
|
2180
|
-
|
|
2181
|
-
# 找出需要添加的索引(排除已存在的索引和不在表中的列)
|
|
2182
|
-
indexes_to_add = []
|
|
2183
|
-
for idx in normalized_indexes:
|
|
2184
|
-
if idx not in existing_indexes and idx in existing_columns:
|
|
2185
|
-
indexes_to_add.append(idx)
|
|
2186
|
-
elif idx in existing_indexes:
|
|
2187
|
-
logger.debug('索引已存在,跳过', {'库': db_name, '表': table_name, '列': idx})
|
|
2188
|
-
elif idx not in existing_columns:
|
|
2189
|
-
logger.warning('索引列不存在于表中,跳过', {'库': db_name, '表': table_name, '列': idx})
|
|
2190
|
-
|
|
2191
|
-
# 添加新索引
|
|
2192
|
-
for idx in indexes_to_add:
|
|
618
|
+
def _ensure_basic_type(value):
|
|
619
|
+
"""确保值是基本数据类型"""
|
|
620
|
+
if isinstance(value, (dict, list)):
|
|
2193
621
|
try:
|
|
2194
|
-
|
|
2195
|
-
except
|
|
2196
|
-
|
|
2197
|
-
|
|
2198
|
-
|
|
2199
|
-
def _get_existing_indexes(self, db_name: str, table_name: str) -> Set[str]:
|
|
2200
|
-
"""
|
|
2201
|
-
获取表中现有的索引列名(包括普通索引和唯一约束)。
|
|
2202
|
-
|
|
2203
|
-
:param db_name: 数据库名
|
|
2204
|
-
:param table_name: 表名
|
|
2205
|
-
:return: 现有索引列名的集合
|
|
2206
|
-
"""
|
|
2207
|
-
sql = """
|
|
2208
|
-
SELECT COLUMN_NAME
|
|
2209
|
-
FROM INFORMATION_SCHEMA.STATISTICS
|
|
2210
|
-
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
|
|
2211
|
-
"""
|
|
2212
|
-
existing_indexes = set()
|
|
2213
|
-
try:
|
|
2214
|
-
with self._get_connection() as conn:
|
|
2215
|
-
with conn.cursor() as cursor:
|
|
2216
|
-
cursor.execute(sql, (db_name, table_name))
|
|
2217
|
-
existing_indexes = {row['COLUMN_NAME'] for row in cursor.fetchall()}
|
|
2218
|
-
except Exception as e:
|
|
2219
|
-
logger.error('获取现有索引失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
|
2220
|
-
raise
|
|
2221
|
-
return existing_indexes
|
|
2222
|
-
|
|
2223
|
-
def _add_index(self, db_name: str, table_name: str, column: str):
|
|
2224
|
-
"""
|
|
2225
|
-
添加索引到指定列。
|
|
2226
|
-
|
|
2227
|
-
:param db_name: 数据库名
|
|
2228
|
-
:param table_name: 表名
|
|
2229
|
-
:param column: 需要添加索引的列名
|
|
2230
|
-
"""
|
|
2231
|
-
sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{column}` (`{column}`)'
|
|
2232
|
-
try:
|
|
2233
|
-
with self._get_connection() as conn:
|
|
2234
|
-
with conn.cursor() as cursor:
|
|
2235
|
-
cursor.execute(sql)
|
|
2236
|
-
conn.commit()
|
|
2237
|
-
logger.debug('已为列创建索引', {'库': db_name, '表': table_name, '列': column})
|
|
2238
|
-
except Exception as e:
|
|
2239
|
-
logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
|
|
2240
|
-
raise
|
|
622
|
+
return json.dumps(value, ensure_ascii=False)
|
|
623
|
+
except (TypeError, ValueError):
|
|
624
|
+
return str(value)
|
|
625
|
+
return value
|
|
2241
626
|
|
|
2242
|
-
@_execute_with_retry
|
|
2243
|
-
def _add_column_to_table(self, db_name: str, table_name: str, column: str, column_type: str, allow_null: bool = False):
|
|
2244
|
-
"""
|
|
2245
|
-
添加列到指定表。
|
|
2246
627
|
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
table_name = self._validate_identifier(table_name)
|
|
2255
|
-
column = self._validate_identifier(column)
|
|
2256
|
-
|
|
2257
|
-
# 构建ALTER TABLE语句
|
|
2258
|
-
null_constraint = "NULL" if allow_null else "NOT NULL"
|
|
2259
|
-
|
|
2260
|
-
# 为新添加的列设置默认值
|
|
2261
|
-
default_value = ""
|
|
2262
|
-
if not allow_null:
|
|
2263
|
-
column_type_lower = column_type.lower()
|
|
2264
|
-
if any(t in column_type_lower for t in ['int', 'bigint', 'tinyint', 'smallint', 'mediumint']):
|
|
2265
|
-
default_value = " DEFAULT 0"
|
|
2266
|
-
elif any(t in column_type_lower for t in ['decimal', 'float', 'double']):
|
|
2267
|
-
default_value = " DEFAULT 0.0"
|
|
2268
|
-
elif any(t in column_type_lower for t in ['varchar', 'text', 'char', 'mediumtext', 'longtext']):
|
|
2269
|
-
default_value = " DEFAULT 'none'"
|
|
2270
|
-
elif 'timestamp' in column_type_lower:
|
|
2271
|
-
# TIMESTAMP类型已经包含DEFAULT定义,不需要额外添加
|
|
2272
|
-
default_value = ""
|
|
2273
|
-
elif 'date' in column_type_lower:
|
|
2274
|
-
if 'datetime' in column_type_lower:
|
|
2275
|
-
default_value = " DEFAULT '2000-01-01 00:00:00'"
|
|
2276
|
-
else:
|
|
2277
|
-
default_value = " DEFAULT '2000-01-01'"
|
|
2278
|
-
elif 'json' in column_type_lower:
|
|
2279
|
-
default_value = " DEFAULT '{}'"
|
|
2280
|
-
|
|
2281
|
-
# 对于TIMESTAMP类型,不添加额外的NULL约束,因为已经包含在类型定义中
|
|
2282
|
-
if 'timestamp' in column_type.lower() and ('default' in column_type.lower() or 'current_timestamp' in column_type.lower()):
|
|
2283
|
-
null_constraint = "" # TIMESTAMP类型已经包含完整定义
|
|
2284
|
-
default_value = ""
|
|
2285
|
-
|
|
2286
|
-
sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD COLUMN `{column}` {column_type} {null_constraint}{default_value}'
|
|
2287
|
-
|
|
2288
|
-
conn = None
|
|
2289
|
-
try:
|
|
2290
|
-
with self._get_connection() as conn:
|
|
2291
|
-
with conn.cursor() as cursor:
|
|
2292
|
-
cursor.execute(sql)
|
|
2293
|
-
conn.commit()
|
|
2294
|
-
logger.debug('已为表添加列', {
|
|
2295
|
-
'库': db_name,
|
|
2296
|
-
'表': table_name,
|
|
2297
|
-
'列': column,
|
|
2298
|
-
'类型': column_type,
|
|
2299
|
-
'允许空值': allow_null
|
|
2300
|
-
})
|
|
2301
|
-
except Exception as e:
|
|
2302
|
-
logger.error('添加列失败', {
|
|
2303
|
-
'库': db_name,
|
|
2304
|
-
'表': table_name,
|
|
2305
|
-
'列': column,
|
|
2306
|
-
'类型': column_type,
|
|
2307
|
-
'错误': str(e),
|
|
2308
|
-
'SQL': sql
|
|
2309
|
-
})
|
|
2310
|
-
if conn is not None:
|
|
2311
|
-
conn.rollback()
|
|
2312
|
-
raise
|
|
2313
|
-
|
|
2314
|
-
def __enter__(self):
|
|
2315
|
-
return self
|
|
2316
|
-
|
|
2317
|
-
def close(self) -> None:
|
|
2318
|
-
"""
|
|
2319
|
-
关闭连接池并清理资源
|
|
2320
|
-
这个方法会安全地关闭数据库连接池,并清理相关资源。
|
|
2321
|
-
建议结束时手动调用此方法。
|
|
2322
|
-
:raises: 可能抛出关闭连接时的异常
|
|
2323
|
-
"""
|
|
2324
|
-
try:
|
|
2325
|
-
if hasattr(self, 'pool') and self.pool is not None:
|
|
628
|
+
def retry_on_failure(max_retries: int = 3, delay: int = 1):
    """Build a decorator that retries a call on transient MySQL connection errors.

    Only ``pymysql.OperationalError`` and ``pymysql.InterfaceError`` trigger a
    retry; any other exception is logged and re-raised immediately.

    :param max_retries: total number of attempts. Values below 1 are treated
        as 1 so the wrapped function is always called at least once
        (previously such values skipped the call and raised ``TypeError``).
    :param delay: base pause in seconds; the pause before the next attempt is
        ``delay * attempt_number`` (linear back-off).
    :return: a decorator preserving the wrapped function's metadata.
    :raises: the last retryable error once all attempts are used up, or the
        first non-retryable error.
    """
    # Never allow zero attempts: the loop below must run at least once.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                    if attempt < attempts - 1:
                        logger.warning('操作失败,准备重试', {
                            '尝试次数': attempt + 1,
                            '错误': str(e)
                        })
                        # Linear back-off: wait longer after each failure.
                        time.sleep(delay * (attempt + 1))
                        continue
                    # Final attempt failed: report and propagate.
                    logger.error(f'操作重试{attempts}次后失败', {'错误': str(e)})
                    raise
                except Exception as e:
                    # Non-retryable error: log and propagate unchanged.
                    logger.error('操作失败', {'错误': str(e)})
                    raise
        return wrapper
    return decorator
|
|
2334
654
|
|
|
2335
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
2336
|
-
self.close()
|
|
2337
655
|
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
with self._get_connection() as conn:
|
|
2354
|
-
with conn.cursor() as cursor:
|
|
2355
|
-
cursor.execute(sql, params)
|
|
2356
|
-
results = cursor.fetchall()
|
|
2357
|
-
logger.debug('查询执行成功', {
|
|
2358
|
-
'sql': self._shorten_for_log(sql, 100),
|
|
2359
|
-
'params': self._shorten_for_log(params, 50),
|
|
2360
|
-
'结果数量': len(results)
|
|
2361
|
-
})
|
|
2362
|
-
return results
|
|
2363
|
-
except Exception as e:
|
|
2364
|
-
logger.error('执行查询时出错', {
|
|
2365
|
-
'sql': self._shorten_for_log(sql, 100),
|
|
2366
|
-
'params': self._shorten_for_log(params, 50),
|
|
2367
|
-
'error': str(e)
|
|
2368
|
-
})
|
|
2369
|
-
raise
|
|
2370
|
-
|
|
2371
|
-
# @_execute_with_retry
|
|
2372
|
-
def execute_update(self, sql: str, params: Optional[Tuple] = None) -> int:
|
|
656
|
+
class MySQLUploader:
    """
    MySQL data uploader (refactored version).

    Features, as described by the original author:
    - every table automatically gets an ``id`` (BIGINT auto-increment primary
      key) plus ``create_at`` / ``update_at`` timestamp columns
    - automatic table creation, table partitioning and column-type inference
    - pooled connections with a retry mechanism
    - batched inserts
    """

    def __init__(self, username: str, password: str, host: str = 'localhost',
                 port: int = 3306, charset: str = 'utf8mb4',
                 collation: str = 'utf8mb4_0900_ai_ci', pool_size: int = 5,
                 max_retries: int = 3, **kwargs):
        """
        Initialise the uploader and its helper components.

        :param username: database user name
        :param password: database password
        :param host: database host address
        :param port: database port
        :param charset: connection character set
        :param collation: collation handed to the table manager
        :param pool_size: connection-pool size
        :param max_retries: number of attempts ``upload_data`` makes when a
            retryable connection error occurs
        :param kwargs: extra entries merged into the connection config
        """
        self.config = {
            'username': username,
            'password': password,
            'host': host,
            'port': port,
            'charset': charset,
            'pool_size': pool_size,
            **kwargs
        }
        self.collation = collation
        self.max_retries = max_retries

        # Helper components share one connection manager.
        self.conn_mgr = DatabaseConnectionManager(self.config)
        self.table_mgr = TableManager(self.conn_mgr, collation)
        self.data_inserter = DataInserter(self.conn_mgr)

    def upload_data(self, db_name: str, table_name: str,
                    data: Union[Dict, List[Dict], pd.DataFrame],
                    set_typ: Optional[Dict[str, str]] = None,
                    allow_null: bool = False,
                    partition_by: Optional[str] = None,
                    partition_date_column: str = '日期',
                    update_on_duplicate: bool = False,
                    unique_keys: Optional[List[List[str]]] = None) -> bool:
        """
        Upload data to a MySQL database.

        Per the original documentation, the system adds these columns to every
        table: ``id`` (BIGINT auto-increment primary key), ``create_at``
        (set on insert) and ``update_at`` (set on insert and update).

        :param db_name: database name (lower-cased before use)
        :param table_name: table name (lower-cased before use)
        :param data: data to upload
        :param set_typ: column type mapping; inferred from the data when None
            (system columns need not be listed)
        :param allow_null: whether NULL values are allowed
        :param partition_by: partitioning mode ('year' or 'month')
        :param partition_date_column: name of the date column used to partition
        :param update_on_duplicate: whether duplicate rows are updated
        :param unique_keys: unique-constraint column groups
            (system columns need not be listed)
        :return: True on success, False when the upload failed
        """
        db_name = db_name.lower()
        table_name = table_name.lower()

        try:
            # The retry wrapper is applied here, inside the try block, so that
            # retryable connection errors reach it before the blanket handler
            # below turns them into a False result. Previously the decorator
            # wrapped this whole method and therefore never saw an exception.
            attempt_upload = retry_on_failure(
                max_retries=self.max_retries
            )(self._upload_once)
            return attempt_upload(
                db_name, table_name, data, set_typ, allow_null,
                partition_by, partition_date_column,
                update_on_duplicate, unique_keys
            )
        except Exception as e:
            logger.error('数据上传失败', {
                '数据库': db_name,
                '表名': table_name,
                '错误': str(e)
            })
            return False

    def _upload_once(self, db_name: str, table_name: str,
                     data: Union[Dict, List[Dict], pd.DataFrame],
                     set_typ: Optional[Dict[str, str]],
                     allow_null: bool,
                     partition_by: Optional[str],
                     partition_date_column: str,
                     update_on_duplicate: bool,
                     unique_keys: Optional[List[List[str]]]) -> bool:
        """Run a single upload attempt; exceptions propagate to the caller."""
        # Normalise the incoming data.
        normalized_data = DataProcessor.normalize_data(data)
        if not normalized_data:
            logger.warning('数据为空,跳过上传')
            return True

        # Infer column types when the caller did not supply them.
        if set_typ is None:
            set_typ = DataTypeInferrer.infer_types_from_data(normalized_data)
            logger.info('自动推断数据类型', {'类型映射': set_typ})

        # Make sure the target database exists.
        self.table_mgr.ensure_database_exists(db_name)

        # Dispatch to the partitioned or the single-table path.
        if partition_by:
            return self._handle_partitioned_upload(
                db_name, table_name, normalized_data, set_typ,
                partition_by, partition_date_column, allow_null,
                update_on_duplicate, unique_keys
            )
        return self._handle_single_table_upload(
            db_name, table_name, normalized_data, set_typ,
            allow_null, update_on_duplicate, unique_keys
        )

    def _handle_single_table_upload(self, db_name: str, table_name: str,
                                    data: List[Dict], set_typ: Dict[str, str],
                                    allow_null: bool, update_on_duplicate: bool,
                                    unique_keys: Optional[List[List[str]]]) -> bool:
        """Upload rows into one table, creating it first when it is missing.

        :return: True when the inserter reports zero failed rows
        """
        # Create the table only when it does not exist yet.
        if not self.table_mgr.table_exists(db_name, table_name):
            self.table_mgr.create_table(db_name, table_name, set_typ,
                                        unique_keys=unique_keys)

        # Prepare the rows for insertion.
        prepared_data = DataProcessor.prepare_data_for_insert(
            data, set_typ, allow_null
        )

        # Insert and collect the per-row outcome counters.
        inserted, skipped, failed = self.data_inserter.insert_data(
            db_name, table_name, prepared_data, set_typ, update_on_duplicate
        )

        logger.info('单表上传完成', {
            '数据库': db_name,
            '表名': table_name,
            '总数': len(data),
            '插入': inserted,
            '跳过': skipped,
            '失败': failed
        })

        return failed == 0

    def _handle_partitioned_upload(self, db_name: str, base_table_name: str,
                                   data: List[Dict], set_typ: Dict[str, str],
                                   partition_by: str, partition_date_column: str,
                                   allow_null: bool, update_on_duplicate: bool,
                                   unique_keys: Optional[List[List[str]]]) -> bool:
        """Split rows by date and upload each group to ``<base>_<suffix>``.

        Every partition is attempted even after an earlier one failed.

        :return: True only when every partition uploaded successfully
        """
        # Group the rows by partition suffix.
        partitioned_data = DataProcessor.partition_data_by_date(
            data, partition_date_column, partition_by
        )

        total_success = True

        for partition_suffix, partition_data in partitioned_data.items():
            partition_table_name = f"{base_table_name}_{partition_suffix}"

            success = self._handle_single_table_upload(
                db_name, partition_table_name, partition_data, set_typ,
                allow_null, update_on_duplicate, unique_keys
            )

            if not success:
                total_success = False

        logger.info('分表上传完成', {
            '数据库': db_name,
            '基础表名': base_table_name,
            '分区数': len(partitioned_data),
            '总体成功': total_success
        })

        return total_success

    def close(self):
        """Close the connection manager; safe to call more than once."""
        # getattr guards against instances whose __init__ never completed.
        conn_mgr = getattr(self, 'conn_mgr', None)
        if conn_mgr:
            # Drop the reference first so a later close() / __del__ is a no-op.
            self.conn_mgr = None
            conn_mgr.close()

    def __del__(self):
        try:
            self.close()
        except Exception:
            # Best-effort cleanup: errors during finalisation are ignored.
            pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
|
|
2740
848
|
|
|
2741
|
-
def main():
|
|
2742
|
-
dir_path = os.path.expanduser("~")
|
|
2743
|
-
parser = myconf.ConfigParser()
|
|
2744
|
-
host, port, username, password = parser.get_section_values(
|
|
2745
|
-
file_path=os.path.join(dir_path, 'spd.txt'),
|
|
2746
|
-
section='mysql',
|
|
2747
|
-
keys=['host', 'port', 'username', 'password'],
|
|
2748
|
-
)
|
|
2749
|
-
host = 'localhost'
|
|
2750
849
|
|
|
850
|
+
# 使用示例
|
|
851
|
+
if __name__ == '__main__':
|
|
852
|
+
# 示例代码
|
|
2751
853
|
uploader = MySQLUploader(
|
|
2752
|
-
username=
|
|
2753
|
-
password=
|
|
2754
|
-
host=
|
|
2755
|
-
port=
|
|
854
|
+
username='your_username',
|
|
855
|
+
password='your_password',
|
|
856
|
+
host='localhost',
|
|
857
|
+
port=3306
|
|
2756
858
|
)
|
|
2757
|
-
|
|
2758
|
-
#
|
|
2759
|
-
|
|
859
|
+
|
|
860
|
+
# 示例数据
|
|
861
|
+
sample_data = [
|
|
862
|
+
{'name': 'Alice', 'age': 25, 'salary': 50000.0, '日期': '2023-01-01'},
|
|
863
|
+
{'name': 'Bob', 'age': 30, 'salary': 60000.0, '日期': '2023-01-02'},
|
|
864
|
+
]
|
|
865
|
+
|
|
866
|
+
# 定义列类型(系统会自动添加id、create_at、update_at列)
|
|
867
|
+
column_types = {
|
|
2760
868
|
'name': 'VARCHAR(255)',
|
|
2761
869
|
'age': 'INT',
|
|
2762
870
|
'salary': 'DECIMAL(10,2)',
|
|
2763
|
-
'日期': 'DATE'
|
|
2764
|
-
'shop': None,
|
|
871
|
+
'日期': 'DATE'
|
|
2765
872
|
}
|
|
2766
|
-
|
|
2767
|
-
# 准备数据
|
|
2768
|
-
data = [
|
|
2769
|
-
{'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
|
|
2770
|
-
{'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': '100'},
|
|
2771
|
-
{'日期': '2023-01-15', 'name': 'Alice', 'AGE': 5, 'salary': 15478},
|
|
2772
|
-
{'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75},
|
|
2773
|
-
]
|
|
2774
|
-
|
|
2775
|
-
# 测试参数验证功能
|
|
2776
|
-
print("=== 测试参数验证功能 ===")
|
|
2777
|
-
|
|
2778
|
-
# 正确的格式
|
|
2779
|
-
print("1. 测试正确的unique_keys格式:")
|
|
2780
|
-
try:
|
|
2781
|
-
valid_unique_keys = [['日期', 'name'], ['age']]
|
|
2782
|
-
result = uploader._validate_unique_keys_format(valid_unique_keys, 'test_db', 'test_table')
|
|
2783
|
-
print(f" 通过: {result}")
|
|
2784
|
-
except Exception as e:
|
|
2785
|
-
print(f" 失败: {e}")
|
|
2786
|
-
|
|
2787
|
-
# 错误的格式 - 缺少一层嵌套
|
|
2788
|
-
print("2. 测试错误的unique_keys格式 (缺少嵌套):")
|
|
2789
|
-
try:
|
|
2790
|
-
invalid_unique_keys = ['日期', 'name'] # 错误:应该是 [['日期', 'name']]
|
|
2791
|
-
result = uploader._validate_unique_keys_format(invalid_unique_keys, 'test_db', 'test_table')
|
|
2792
|
-
print(f" 通过: {result}")
|
|
2793
|
-
except Exception as e:
|
|
2794
|
-
print(f" 正确捕获错误: {e}")
|
|
2795
873
|
|
|
2796
|
-
#
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
# 错误的格式 - 空字符串
|
|
2806
|
-
print("4. 测试错误的unique_keys格式 (空字符串):")
|
|
2807
|
-
try:
|
|
2808
|
-
invalid_unique_keys = [['日期', '']] # 错误:空字符串
|
|
2809
|
-
result = uploader._validate_unique_keys_format(invalid_unique_keys, 'test_db', 'test_table')
|
|
2810
|
-
print(f" 通过: {result}")
|
|
2811
|
-
except Exception as e:
|
|
2812
|
-
print(f" 正确捕获错误: {e}")
|
|
2813
|
-
|
|
2814
|
-
# 错误的格式 - 重复列名
|
|
2815
|
-
print("5. 测试错误的unique_keys格式 (重复列名):")
|
|
2816
|
-
try:
|
|
2817
|
-
invalid_unique_keys = [['日期', '日期']] # 错误:重复列名
|
|
2818
|
-
result = uploader._validate_unique_keys_format(invalid_unique_keys, 'test_db', 'test_table')
|
|
2819
|
-
print(f" 通过: {result}")
|
|
2820
|
-
except Exception as e:
|
|
2821
|
-
print(f" 正确捕获错误: {e}")
|
|
2822
|
-
|
|
2823
|
-
# 空值测试 - 空列表
|
|
2824
|
-
print("6. 测试空值情况 - 空列表:")
|
|
2825
|
-
try:
|
|
2826
|
-
empty_list = []
|
|
2827
|
-
result = uploader._validate_unique_keys_format(empty_list, 'test_db', 'test_table')
|
|
2828
|
-
print(f" 通过: {result}")
|
|
2829
|
-
except Exception as e:
|
|
2830
|
-
print(f" 失败: {e}")
|
|
2831
|
-
|
|
2832
|
-
# 空值测试 - 包含空列表
|
|
2833
|
-
print("7. 测试空值情况 - 包含空列表 [[]]:")
|
|
2834
|
-
try:
|
|
2835
|
-
empty_nested = [[]]
|
|
2836
|
-
result = uploader._validate_unique_keys_format(empty_nested, 'test_db', 'test_table')
|
|
2837
|
-
print(f" 通过: {result}")
|
|
2838
|
-
except Exception as e:
|
|
2839
|
-
print(f" 失败: {e}")
|
|
2840
|
-
|
|
2841
|
-
# 空值测试 - 混合空列表和有效列表
|
|
2842
|
-
print("8. 测试空值情况 - 混合空列表和有效列表 [[], ['col1']]:")
|
|
2843
|
-
try:
|
|
2844
|
-
mixed_empty = [[], ['col1']]
|
|
2845
|
-
result = uploader._validate_unique_keys_format(mixed_empty, 'test_db', 'test_table')
|
|
2846
|
-
print(f" 通过: {result}")
|
|
2847
|
-
except Exception as e:
|
|
2848
|
-
print(f" 失败: {e}")
|
|
2849
|
-
|
|
2850
|
-
# 空值测试 - 包含空字符串的列表
|
|
2851
|
-
print("9. 测试空值情况 - 包含空字符串的列表 [[''], ['col1']]:")
|
|
2852
|
-
try:
|
|
2853
|
-
empty_string_list = [[''], ['col1']]
|
|
2854
|
-
result = uploader._validate_unique_keys_format(empty_string_list, 'test_db', 'test_table')
|
|
2855
|
-
print(f" 通过: {result}")
|
|
2856
|
-
except Exception as e:
|
|
2857
|
-
print(f" 正确捕获错误: {e}")
|
|
2858
|
-
|
|
2859
|
-
# 空值测试 - 包含纯空白字符的列表
|
|
2860
|
-
print("10. 测试空值情况 - 包含纯空白字符的列表 [[' '], ['col1']]:")
|
|
2861
|
-
try:
|
|
2862
|
-
whitespace_list = [[' '], ['col1']]
|
|
2863
|
-
result = uploader._validate_unique_keys_format(whitespace_list, 'test_db', 'test_table')
|
|
2864
|
-
print(f" 通过: {result}")
|
|
2865
|
-
except Exception as e:
|
|
2866
|
-
print(f" 正确捕获错误: {e}")
|
|
2867
|
-
|
|
2868
|
-
# 测试indexes的空值处理
|
|
2869
|
-
print("\n=== 测试indexes空值处理 ===")
|
|
2870
|
-
print("11. 测试indexes包含空字符串 ['', 'col1']:")
|
|
2871
|
-
try:
|
|
2872
|
-
indexes_with_empty = ['', 'col1']
|
|
2873
|
-
result = uploader._validate_indexes_format(indexes_with_empty, 'test_db', 'test_table')
|
|
2874
|
-
print(f" 通过: {result}")
|
|
2875
|
-
except Exception as e:
|
|
2876
|
-
print(f" 失败: {e}")
|
|
2877
|
-
|
|
2878
|
-
# 测试primary_keys的空值处理
|
|
2879
|
-
print("12. 测试primary_keys包含空字符串 ['', 'col1']:")
|
|
2880
|
-
try:
|
|
2881
|
-
primary_keys_with_empty = ['', 'col1']
|
|
2882
|
-
result = uploader._validate_primary_keys_format(primary_keys_with_empty, 'test_db', 'test_table')
|
|
2883
|
-
print(f" 通过: {result}")
|
|
2884
|
-
except Exception as e:
|
|
2885
|
-
print(f" 失败: {e}")
|
|
2886
|
-
|
|
2887
|
-
# 上传数据(使用正确的格式)
|
|
2888
|
-
print("\n=== 开始上传数据 ===")
|
|
2889
|
-
uploader.upload_data(
|
|
2890
|
-
db_name='测试库',
|
|
2891
|
-
table_name='测试表',
|
|
2892
|
-
data=data,
|
|
2893
|
-
set_typ=set_typ, # 定义列和数据类型
|
|
2894
|
-
primary_keys=[], # 创建唯一主键
|
|
2895
|
-
check_duplicate=False, # 检查重复数据
|
|
2896
|
-
duplicate_columns=[], # 指定排重的组合键
|
|
2897
|
-
update_on_duplicate=True, # 更新旧数据
|
|
2898
|
-
allow_null=False, # 允许插入空值
|
|
2899
|
-
partition_by='year', # 分表方式
|
|
2900
|
-
partition_date_column='日期', # 用于分表的日期列名,默认为'日期'
|
|
2901
|
-
indexes=[], # 普通索引列
|
|
2902
|
-
transaction_mode='row', # 事务模式
|
|
2903
|
-
unique_keys=[['日期', 'name', 'age']], # 唯一约束列表 - 正确的格式
|
|
874
|
+
# 上传数据
|
|
875
|
+
success = uploader.upload_data(
|
|
876
|
+
db_name='test_db',
|
|
877
|
+
table_name='test_table',
|
|
878
|
+
data=sample_data,
|
|
879
|
+
set_typ=column_types,
|
|
880
|
+
allow_null=False,
|
|
881
|
+
update_on_duplicate=True,
|
|
882
|
+
unique_keys=[['name', '日期']]
|
|
2904
883
|
)
|
|
2905
|
-
|
|
884
|
+
|
|
2906
885
|
uploader.close()
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
if __name__ == '__main__':
|
|
2910
|
-
# main()
|
|
2911
|
-
pass
|
|
886
|
+
print(f"上传结果: {success}")
|