mdbq 3.9.1__py3-none-any.whl → 3.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/mysql.py +1910 -1698
- {mdbq-3.9.1.dist-info → mdbq-3.9.2.dist-info}/METADATA +1 -1
- {mdbq-3.9.1.dist-info → mdbq-3.9.2.dist-info}/RECORD +6 -6
- {mdbq-3.9.1.dist-info → mdbq-3.9.2.dist-info}/WHEEL +0 -0
- {mdbq-3.9.1.dist-info → mdbq-3.9.2.dist-info}/top_level.txt +0 -0
mdbq/mysql/mysql.py
CHANGED
@@ -10,9 +10,14 @@ import pandas as pd
|
|
10
10
|
from sqlalchemy import create_engine
|
11
11
|
import os
|
12
12
|
import logging
|
13
|
+
import logging.handlers
|
13
14
|
from mdbq.other import otk
|
14
|
-
from dbutils.pooled_db import PooledDB
|
15
15
|
from typing import Union, List, Dict, Optional, Any, Tuple
|
16
|
+
from dbutils.pooled_db import PooledDB
|
17
|
+
import json
|
18
|
+
import psutil # 用于监控资源使用情况
|
19
|
+
|
20
|
+
|
16
21
|
warnings.filterwarnings('ignore')
|
17
22
|
"""
|
18
23
|
建表流程:
|
@@ -44,917 +49,427 @@ def count_decimal_places(num_str):
|
|
44
49
|
return 0, 0
|
45
50
|
|
46
51
|
|
47
|
-
class
|
48
|
-
def __init__(
|
49
|
-
self,
|
50
|
-
username: str,
|
51
|
-
password: str,
|
52
|
-
host: str = 'localhost',
|
53
|
-
port: int = 3306,
|
54
|
-
charset: str = 'utf8mb4',
|
55
|
-
collation: str = 'utf8mb4_0900_ai_ci',
|
56
|
-
enable_logging: bool = False,
|
57
|
-
log_level: str = 'ERROR',
|
58
|
-
max_retries: int = 10,
|
59
|
-
retry_interval: int = 10,
|
60
|
-
pool_size: int = 5,
|
61
|
-
connect_timeout: int = 10,
|
62
|
-
read_timeout: int = 30,
|
63
|
-
write_timeout: int = 30,
|
64
|
-
ssl: Optional[Dict] = None
|
65
|
-
):
|
66
|
-
"""
|
67
|
-
初始化MySQL上传工具
|
68
|
-
|
69
|
-
:param username: 数据库用户名
|
70
|
-
:param password: 数据库密码
|
71
|
-
:param host: 数据库主机地址,默认为localhost
|
72
|
-
:param port: 数据库端口,默认为3306
|
73
|
-
:param charset: 字符集,默认为utf8mb4
|
74
|
-
:param collation: 排序规则,默认为utf8mb4_0900_ai_ci
|
75
|
-
:param enable_logging: 是否启用日志,默认为False
|
76
|
-
:param log_level: 日志级别,默认为ERROR
|
77
|
-
:param max_retries: 最大重试次数,默认为10
|
78
|
-
:param retry_interval: 重试间隔(秒),默认为10
|
79
|
-
:param pool_size: 连接池大小,默认为5
|
80
|
-
:param connect_timeout: 连接超时(秒),默认为10
|
81
|
-
:param read_timeout: 读取超时(秒),默认为30
|
82
|
-
:param write_timeout: 写入超时(秒),默认为30
|
83
|
-
:param ssl: SSL配置字典,默认为None
|
84
|
-
"""
|
52
|
+
class MysqlUpload:
|
53
|
+
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
85
54
|
self.username = username
|
86
55
|
self.password = password
|
87
56
|
self.host = host
|
88
57
|
self.port = port
|
89
|
-
|
90
|
-
|
91
|
-
self.max_retries = max(max_retries, 1) # 至少重试1次
|
92
|
-
self.retry_interval = max(retry_interval, 1) # 至少间隔1秒
|
93
|
-
self.pool_size = max(pool_size, 1) # 至少1个连接
|
94
|
-
self.connect_timeout = connect_timeout
|
95
|
-
self.read_timeout = read_timeout
|
96
|
-
self.write_timeout = write_timeout
|
97
|
-
self.ssl = ssl
|
98
|
-
self._prepared_statements = {} # 预处理语句缓存
|
99
|
-
self._max_cached_statements = 100 # 最大缓存语句数
|
100
|
-
|
101
|
-
# 初始化日志
|
102
|
-
if enable_logging:
|
103
|
-
self._init_logging(log_level)
|
58
|
+
if username == '' or password == '' or host == '' or port == 0:
|
59
|
+
self.config = None
|
104
60
|
else:
|
105
|
-
self.
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
|
113
|
-
level = log_level.upper() if log_level.upper() in valid_levels else 'ERROR'
|
114
|
-
|
115
|
-
logging.basicConfig(
|
116
|
-
level=getattr(logging, level),
|
117
|
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
118
|
-
handlers=[logging.StreamHandler()]
|
119
|
-
)
|
120
|
-
self.logger = logging.getLogger('MySQLUploader')
|
121
|
-
|
122
|
-
def _create_connection_pool(self) -> PooledDB:
|
123
|
-
"""创建数据库连接池"""
|
124
|
-
pool_params = {
|
125
|
-
'creator': pymysql,
|
126
|
-
'host': self.host,
|
127
|
-
'port': self.port,
|
128
|
-
'user': self.username,
|
129
|
-
'password': self.password,
|
130
|
-
'charset': self.charset,
|
131
|
-
'cursorclass': pymysql.cursors.DictCursor,
|
132
|
-
'maxconnections': self.pool_size,
|
133
|
-
'ping': 7, # 连接检查
|
134
|
-
'connect_timeout': self.connect_timeout,
|
135
|
-
'read_timeout': self.read_timeout,
|
136
|
-
'write_timeout': self.write_timeout,
|
137
|
-
'autocommit': False
|
138
|
-
}
|
139
|
-
|
140
|
-
if self.ssl:
|
141
|
-
required_keys = {'ca', 'cert', 'key'}
|
142
|
-
if not all(k in self.ssl for k in required_keys):
|
143
|
-
raise ValueError("SSL配置必须包含ca、cert和key")
|
144
|
-
pool_params['ssl'] = {
|
145
|
-
'ca': self.ssl['ca'],
|
146
|
-
'cert': self.ssl['cert'],
|
147
|
-
'key': self.ssl['key'],
|
148
|
-
'check_hostname': self.ssl.get('check_hostname', False)
|
61
|
+
self.config = {
|
62
|
+
'host': self.host,
|
63
|
+
'port': int(self.port),
|
64
|
+
'user': self.username,
|
65
|
+
'password': self.password,
|
66
|
+
'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
|
67
|
+
'cursorclass': pymysql.cursors.DictCursor,
|
149
68
|
}
|
69
|
+
self.filename = None
|
150
70
|
|
151
|
-
|
152
|
-
|
153
|
-
return pool
|
154
|
-
except Exception as e:
|
155
|
-
if self.logger:
|
156
|
-
self.logger.error("连接池创建失败: %s", str(e))
|
157
|
-
raise ConnectionError(f"连接池创建失败: {str(e)}")
|
71
|
+
@staticmethod
|
72
|
+
def try_except(func): # 在类内部定义一个异常处理方法
|
158
73
|
|
159
|
-
|
160
|
-
|
161
|
-
'%Y-%m-%d %H:%M:%S',
|
162
|
-
'%Y-%m-%d',
|
163
|
-
'%Y/%m/%d %H:%M:%S',
|
164
|
-
'%Y/%m/%d',
|
165
|
-
'%Y%m%d',
|
166
|
-
'%Y-%m-%dT%H:%M:%S', # ISO格式
|
167
|
-
'%Y-%m-%d %H:%M:%S.%f' # 带毫秒
|
168
|
-
]
|
169
|
-
for fmt in formats:
|
74
|
+
@wraps(func)
|
75
|
+
def wrapper(*args, **kwargs):
|
170
76
|
try:
|
171
|
-
return
|
172
|
-
except
|
173
|
-
|
174
|
-
raise ValueError(f"无效的日期格式: {value}")
|
175
|
-
|
176
|
-
def _validate_identifier(self, identifier: str) -> str:
|
177
|
-
"""
|
178
|
-
验证并清理数据库标识符(数据库名、表名、列名)
|
179
|
-
防止SQL注入和非法字符
|
180
|
-
|
181
|
-
:param identifier: 要验证的标识符
|
182
|
-
:return: 清理后的安全标识符
|
183
|
-
:raises ValueError: 如果标识符无效
|
184
|
-
"""
|
185
|
-
if not identifier or not isinstance(identifier, str):
|
186
|
-
error_msg = f"无效的标识符: {identifier}"
|
187
|
-
if self.logger:
|
188
|
-
self.logger.error(error_msg)
|
189
|
-
raise ValueError(error_msg)
|
190
|
-
|
191
|
-
# 移除可能有害的字符,只保留字母、数字、下划线和美元符号
|
192
|
-
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
|
193
|
-
if not cleaned:
|
194
|
-
error_msg = f"无法清理异常标识符: {identifier}"
|
195
|
-
if self.logger:
|
196
|
-
self.logger.error(error_msg)
|
197
|
-
raise ValueError(error_msg)
|
198
|
-
|
199
|
-
# 检查是否为MySQL保留字
|
200
|
-
mysql_keywords = {
|
201
|
-
'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
|
202
|
-
'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
|
203
|
-
}
|
204
|
-
if cleaned.lower() in mysql_keywords:
|
205
|
-
if self.logger:
|
206
|
-
self.logger.warning("存在MySQL保留字: %s", cleaned)
|
207
|
-
return f"`{cleaned}`"
|
208
|
-
|
209
|
-
return cleaned
|
77
|
+
return func(*args, **kwargs)
|
78
|
+
except Exception as e:
|
79
|
+
logger.error(f'{func.__name__}, {e}') # 将异常信息返回
|
210
80
|
|
211
|
-
|
212
|
-
"""
|
213
|
-
验证并清理数据值,根据列类型进行适当转换
|
81
|
+
return wrapper
|
214
82
|
|
215
|
-
|
216
|
-
|
217
|
-
:
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
83
|
+
def keep_connect(self, _db_name, _config, max_try: int=10):
|
84
|
+
attempts = 1
|
85
|
+
while attempts <= max_try:
|
86
|
+
try:
|
87
|
+
connection = pymysql.connect(**_config) # 连接数据库
|
88
|
+
return connection
|
89
|
+
except Exception as e:
|
90
|
+
logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
|
91
|
+
attempts += 1
|
92
|
+
time.sleep(30)
|
93
|
+
logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
|
94
|
+
return None
|
222
95
|
|
223
|
-
|
224
|
-
|
96
|
+
def cover_doc_dtypes(self, dict_data):
|
97
|
+
""" 清理字典键值 并转换数据类型 """
|
98
|
+
if not dict_data:
|
99
|
+
logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
|
100
|
+
return
|
101
|
+
__res_dict = {}
|
102
|
+
new_dict_data = {}
|
103
|
+
for k, v in dict_data.items():
|
104
|
+
k = str(k).lower()
|
105
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
|
106
|
+
k = k.replace(')', '')
|
107
|
+
k = re.sub(r'_{2,}', '_', k)
|
108
|
+
k = re.sub(r'_+$', '', k)
|
109
|
+
result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
|
110
|
+
result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
|
111
|
+
result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
|
112
|
+
result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
|
225
113
|
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
elif
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
114
|
+
date_type = otk.is_valid_date(v) # 判断日期时间
|
115
|
+
int_num = otk.is_integer(v) # 判断整数
|
116
|
+
count_int, count_float = count_decimal_places(v) # 判断小数,返回小数位数
|
117
|
+
if result1: # 京东sku/spu商品信息
|
118
|
+
__res_dict.update({k: 'varchar(100)'})
|
119
|
+
elif k == '日期':
|
120
|
+
__res_dict.update({k: 'DATE'})
|
121
|
+
elif k == '更新时间':
|
122
|
+
__res_dict.update({k: 'TIMESTAMP'})
|
123
|
+
elif result2: # 小数
|
124
|
+
__res_dict.update({k: 'decimal(10,4)'})
|
125
|
+
elif date_type == 1: # 纯日期
|
126
|
+
__res_dict.update({k: 'DATE'})
|
127
|
+
elif date_type == 2: # 日期+时间
|
128
|
+
__res_dict.update({k: 'DATETIME'})
|
129
|
+
elif int_num:
|
130
|
+
__res_dict.update({k: 'INT'})
|
131
|
+
elif count_float > 0:
|
132
|
+
if count_int + count_float > 10:
|
133
|
+
if count_float >= 6:
|
134
|
+
__res_dict.update({k: 'decimal(14,6)'})
|
135
|
+
else:
|
136
|
+
__res_dict.update({k: 'decimal(14,4)'})
|
137
|
+
elif count_float >= 6:
|
138
|
+
__res_dict.update({k: 'decimal(14,6)'})
|
139
|
+
elif count_float >= 4:
|
140
|
+
__res_dict.update({k: 'decimal(12,4)'})
|
141
|
+
else:
|
142
|
+
__res_dict.update({k: 'decimal(10,2)'})
|
247
143
|
else:
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
self.logger.error(error_msg)
|
253
|
-
raise ValueError(error_msg)
|
144
|
+
__res_dict.update({k: 'varchar(255)'})
|
145
|
+
new_dict_data.update({k: v})
|
146
|
+
__res_dict.update({'数据主体': 'longblob'})
|
147
|
+
return __res_dict, new_dict_data
|
254
148
|
|
255
|
-
|
149
|
+
@try_except
|
150
|
+
def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
256
151
|
"""
|
257
|
-
|
258
|
-
|
259
|
-
:
|
260
|
-
:
|
261
|
-
:
|
262
|
-
:
|
263
|
-
:raises Exception: 如果所有重试都失败
|
152
|
+
插入字典数据
|
153
|
+
dict_data: 字典
|
154
|
+
index_length: 索引长度
|
155
|
+
icm_update: 增量更正
|
156
|
+
set_typ: {}
|
157
|
+
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
264
158
|
"""
|
159
|
+
if not self.config:
|
160
|
+
return
|
265
161
|
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
162
|
+
if not dict_data_list:
|
163
|
+
logger.info(f'dict_data_list 不能为空 ')
|
164
|
+
return
|
165
|
+
dict_data = dict_data_list[0]
|
166
|
+
if cut_data:
|
167
|
+
if '日期' in dict_data.keys():
|
270
168
|
try:
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
if attempt < self.max_retries - 1:
|
278
|
-
wait_time = self.retry_interval * (attempt + 1)
|
279
|
-
if self.logger:
|
280
|
-
self.logger.warning(
|
281
|
-
"尝试 %d/%d 失败: %s. %d秒后重试...",
|
282
|
-
attempt + 1, self.max_retries, str(e), wait_time
|
283
|
-
)
|
284
|
-
time.sleep(wait_time)
|
285
|
-
# 尝试重新连接
|
286
|
-
try:
|
287
|
-
self.pool = self._create_connection_pool()
|
288
|
-
except Exception as reconnect_error:
|
289
|
-
if self.logger:
|
290
|
-
self.logger.error("重连失败: %s", str(reconnect_error))
|
291
|
-
continue
|
169
|
+
__y = pd.to_datetime(dict_data['日期']).strftime('%Y')
|
170
|
+
__y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
|
171
|
+
if str(cut_data).lower() == 'year':
|
172
|
+
table_name = f'{table_name}_{__y}'
|
173
|
+
elif str(cut_data).lower() == 'month':
|
174
|
+
table_name = f'{table_name}_{__y_m}'
|
292
175
|
else:
|
293
|
-
|
294
|
-
self.logger.error(
|
295
|
-
"Operation failed after %d attempts. Last error: %s",
|
296
|
-
self.max_retries, str(e)
|
297
|
-
)
|
298
|
-
except pymysql.IntegrityError as e:
|
299
|
-
# 完整性错误通常不需要重试
|
300
|
-
if self.logger:
|
301
|
-
self.logger.error("完整性约束错误: %s", str(e))
|
302
|
-
raise e
|
176
|
+
logger.info(f'参数不正确,cut_data应为 year 或 month ')
|
303
177
|
except Exception as e:
|
304
|
-
|
305
|
-
if self.logger:
|
306
|
-
self.logger.error("发生意外错误: %s", str(e))
|
307
|
-
break
|
308
|
-
|
309
|
-
raise last_exception if last_exception else Exception("发生未知错误")
|
310
|
-
|
311
|
-
return wrapper(*args, **kwargs)
|
178
|
+
logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
312
179
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
180
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
181
|
+
if not connection:
|
182
|
+
return
|
183
|
+
with connection.cursor() as cursor:
|
184
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
185
|
+
database_exists = cursor.fetchone()
|
186
|
+
if not database_exists:
|
187
|
+
# 如果数据库不存在,则新建
|
188
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
189
|
+
cursor.execute(sql)
|
190
|
+
connection.commit()
|
191
|
+
logger.info(f"创建Database: {db_name}")
|
324
192
|
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
193
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
194
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
195
|
+
if not connection:
|
196
|
+
return
|
197
|
+
with connection.cursor() as cursor:
|
198
|
+
# 1. 查询表, 不存在则创建一个空表
|
199
|
+
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
200
|
+
cursor.execute(sql, (table_name,))
|
201
|
+
if not cursor.fetchone():
|
202
|
+
sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
|
203
|
+
cursor.execute(sql)
|
204
|
+
logger.info(f'创建 mysql 表: {table_name}')
|
329
205
|
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
return exists
|
338
|
-
except Exception as e:
|
339
|
-
if self.logger:
|
340
|
-
self.logger.error("检查数据库是否存在时出错: %s", str(e))
|
341
|
-
raise
|
206
|
+
# 根据 dict_data 的值添加指定的数据类型
|
207
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
208
|
+
if set_typ:
|
209
|
+
# 更新自定义的列数据类型
|
210
|
+
for k, v in dtypes.copy().items():
|
211
|
+
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
212
|
+
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
342
213
|
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
214
|
+
# 检查列
|
215
|
+
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
216
|
+
cursor.execute(sql, (db_name, table_name))
|
217
|
+
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
|
218
|
+
col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
|
219
|
+
# 不存在则新建列
|
220
|
+
if col_not_exist: # 数据表中不存在的列
|
221
|
+
for col in col_not_exist:
|
222
|
+
# 创建列,需转义
|
223
|
+
if allow_not_null:
|
224
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
|
225
|
+
else:
|
226
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
347
227
|
|
348
|
-
try:
|
349
|
-
with self._get_connection() as conn:
|
350
|
-
with conn.cursor() as cursor:
|
351
228
|
cursor.execute(sql)
|
352
|
-
|
353
|
-
if self.logger:
|
354
|
-
self.logger.info("数据库 %s 创建成功", db_name)
|
355
|
-
except Exception as e:
|
356
|
-
if self.logger:
|
357
|
-
self.logger.error("无法创建数据库 %s: %s", db_name, str(e))
|
358
|
-
conn.rollback()
|
359
|
-
raise
|
229
|
+
logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
360
230
|
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
sql = """
|
366
|
-
SELECT TABLE_NAME
|
367
|
-
FROM INFORMATION_SCHEMA.TABLES
|
368
|
-
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
|
369
|
-
"""
|
231
|
+
if col == '日期':
|
232
|
+
sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
233
|
+
logger.info(f"设置为索引: {col}({dtypes[col]})")
|
234
|
+
cursor.execute(sql)
|
370
235
|
|
371
|
-
|
372
|
-
|
373
|
-
|
236
|
+
connection.commit() # 提交事务
|
237
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
238
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
239
|
+
# 处理插入的数据
|
240
|
+
for dict_data in dict_data_list:
|
241
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
242
|
+
if icm_update:
|
243
|
+
""" 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
|
244
|
+
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
374
245
|
cursor.execute(sql, (db_name, table_name))
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
self.logger.error("检查数据表是否存在时发生未知错误: %s", str(e))
|
380
|
-
raise
|
381
|
-
|
382
|
-
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
383
|
-
"""获取表的列名和数据类型"""
|
384
|
-
db_name = self._validate_identifier(db_name)
|
385
|
-
table_name = self._validate_identifier(table_name)
|
386
|
-
sql = """
|
387
|
-
SELECT COLUMN_NAME, DATA_TYPE
|
388
|
-
FROM INFORMATION_SCHEMA.COLUMNS
|
389
|
-
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
|
390
|
-
ORDER BY ORDINAL_POSITION
|
391
|
-
"""
|
246
|
+
columns = cursor.fetchall()
|
247
|
+
cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
|
248
|
+
# 保留原始列名,不提前转义
|
249
|
+
raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
|
392
250
|
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
self.logger.debug("获取表 %s.%s 的列信息: %s", db_name, table_name, columns)
|
400
|
-
return columns
|
401
|
-
except Exception as e:
|
402
|
-
if self.logger:
|
403
|
-
self.logger.error("无法获取表列信息: %s", str(e))
|
404
|
-
raise
|
251
|
+
# 构建条件参数(使用原始列名)
|
252
|
+
condition_params = []
|
253
|
+
condition_parts = []
|
254
|
+
for up_col in icm_update:
|
255
|
+
condition_parts.append(f"`{up_col}` = %s") # SQL 转义
|
256
|
+
condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
|
405
257
|
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
) -> List[Dict]:
|
412
|
-
"""
|
413
|
-
准备要上传的数据,验证并转换数据类型
|
258
|
+
# 动态转义列名生成 SQL 查询字段
|
259
|
+
escaped_update_col = [f'`{col}`' for col in raw_update_col]
|
260
|
+
sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
|
261
|
+
cursor.execute(sql, condition_params)
|
262
|
+
results = cursor.fetchall()
|
414
263
|
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
data = data.replace({pd.NA: None}).to_dict('records')
|
425
|
-
except Exception as e:
|
426
|
-
if self.logger:
|
427
|
-
self.logger.error("Failed to convert DataFrame to dict: %s", str(e))
|
428
|
-
raise ValueError(f"Failed to convert DataFrame to dict: {str(e)}")
|
429
|
-
elif isinstance(data, dict):
|
430
|
-
data = [data]
|
431
|
-
elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
|
432
|
-
error_msg = "Data must be a dict, list of dicts, or DataFrame"
|
433
|
-
if self.logger:
|
434
|
-
self.logger.error(error_msg)
|
435
|
-
raise ValueError(error_msg)
|
264
|
+
if results:
|
265
|
+
for result in results:
|
266
|
+
change_col = []
|
267
|
+
change_placeholders = []
|
268
|
+
set_params = []
|
269
|
+
for raw_col in raw_update_col:
|
270
|
+
# 使用原始列名访问数据
|
271
|
+
df_value = str(dict_data[raw_col])
|
272
|
+
mysql_value = str(result[raw_col])
|
436
273
|
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
if col_name.lower() == 'id':
|
443
|
-
continue
|
274
|
+
# 清理小数点后多余的零
|
275
|
+
if '.' in df_value:
|
276
|
+
df_value = re.sub(r'0+$', '', df_value).rstrip('.')
|
277
|
+
if '.' in mysql_value:
|
278
|
+
mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
|
444
279
|
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
self.logger.error(error_msg)
|
450
|
-
raise ValueError(error_msg)
|
451
|
-
prepared_row[col_name] = None
|
452
|
-
else:
|
453
|
-
try:
|
454
|
-
prepared_row[col_name] = self._validate_value(row[col_name], col_type)
|
455
|
-
except ValueError as e:
|
456
|
-
error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
|
457
|
-
if self.logger:
|
458
|
-
self.logger.error(error_msg)
|
459
|
-
raise ValueError(error_msg)
|
460
|
-
prepared_data.append(prepared_row)
|
280
|
+
if df_value != mysql_value:
|
281
|
+
change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
|
282
|
+
set_params.append(dict_data[raw_col])
|
283
|
+
change_col.append(raw_col)
|
461
284
|
|
462
|
-
|
463
|
-
|
464
|
-
|
285
|
+
if change_placeholders:
|
286
|
+
full_params = set_params + condition_params
|
287
|
+
sql = f"""UPDATE `{table_name}`
|
288
|
+
SET {','.join(change_placeholders)}
|
289
|
+
WHERE {' AND '.join(condition_parts)}"""
|
290
|
+
cursor.execute(sql, full_params)
|
291
|
+
else: # 没有数据返回,则直接插入数据
|
292
|
+
# 参数化插入
|
293
|
+
cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
|
294
|
+
placeholders = ', '.join(['%s'] * len(dict_data))
|
295
|
+
sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})"
|
296
|
+
cursor.execute(sql, tuple(dict_data.values()))
|
297
|
+
connection.commit() # 提交数据库
|
298
|
+
continue
|
465
299
|
|
466
|
-
|
467
|
-
|
468
|
-
|
300
|
+
# 标准插入逻辑(参数化修改)
|
301
|
+
# 构造更新列(排除主键)
|
302
|
+
update_cols = [k for k in dict_data.keys()]
|
303
|
+
# 构建SQL
|
304
|
+
cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
|
305
|
+
placeholders = ', '.join(['%s'] * len(dict_data))
|
306
|
+
update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
|
469
307
|
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
try:
|
480
|
-
date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
|
481
|
-
except ValueError:
|
482
|
-
error_msg = f"无效的日期格式: {date_value}"
|
483
|
-
if self.logger:
|
484
|
-
self.logger.error("无效的日期格式: %s", date_value)
|
485
|
-
raise ValueError(error_msg)
|
486
|
-
|
487
|
-
if partition_by == 'year':
|
488
|
-
return f"{table_name}_{date_obj.year}"
|
489
|
-
elif partition_by == 'month':
|
490
|
-
return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
|
491
|
-
else:
|
492
|
-
error_msg = "partition_by must be 'year' or 'month'"
|
493
|
-
if self.logger:
|
494
|
-
self.logger.error(error_msg)
|
495
|
-
raise ValueError(error_msg)
|
308
|
+
sql = f"""INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
|
309
|
+
# 执行参数化查询
|
310
|
+
try:
|
311
|
+
cursor.execute(sql, tuple(dict_data.values()))
|
312
|
+
connection.commit()
|
313
|
+
except pymysql.Error as e:
|
314
|
+
logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
|
315
|
+
connection.rollback()
|
316
|
+
connection.close()
|
496
317
|
|
497
|
-
|
498
|
-
|
499
|
-
db_name: str,
|
500
|
-
table_name: str,
|
501
|
-
columns: Dict[str, str],
|
502
|
-
primary_keys: Optional[List[str]] = None,
|
503
|
-
date_column: Optional[str] = None,
|
504
|
-
indexes: Optional[List[str]] = None,
|
505
|
-
unique_columns: Optional[List[str]] = None
|
506
|
-
):
|
318
|
+
# @try_except
|
319
|
+
def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
507
320
|
"""
|
508
|
-
|
509
|
-
|
510
|
-
:
|
511
|
-
:
|
512
|
-
:
|
513
|
-
:
|
514
|
-
:param date_column: 日期列名,如果存在将设置为索引
|
515
|
-
:param indexes: 需要创建索引的列列表
|
516
|
-
:param unique_columns: 需要创建唯一索引的列列表
|
321
|
+
插入字典数据
|
322
|
+
dict_data: 字典
|
323
|
+
index_length: 索引长度
|
324
|
+
icm_update: 增量更新
|
325
|
+
set_typ: {}
|
326
|
+
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
517
327
|
"""
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
if not columns:
|
522
|
-
error_msg = "No columns specified for table creation"
|
523
|
-
if self.logger:
|
524
|
-
self.logger.error(error_msg)
|
525
|
-
raise ValueError(error_msg)
|
526
|
-
|
527
|
-
# 构建列定义SQL
|
528
|
-
column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
|
328
|
+
if not self.config:
|
329
|
+
return
|
529
330
|
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
331
|
+
if cut_data:
|
332
|
+
if '日期' in dict_data.keys():
|
333
|
+
try:
|
334
|
+
__y = pd.to_datetime(dict_data['日期']).strftime('%Y')
|
335
|
+
__y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
|
336
|
+
if str(cut_data).lower() == 'year':
|
337
|
+
table_name = f'{table_name}_{__y}'
|
338
|
+
elif str(cut_data).lower() == 'month':
|
339
|
+
table_name = f'{table_name}_{__y_m}'
|
340
|
+
else:
|
341
|
+
logger.info(f'参数不正确,cut_data应为 year 或 month ')
|
342
|
+
except Exception as e:
|
343
|
+
logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
537
344
|
|
538
|
-
|
539
|
-
|
540
|
-
|
345
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
346
|
+
if not connection:
|
347
|
+
return
|
348
|
+
with connection.cursor() as cursor:
|
349
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
350
|
+
database_exists = cursor.fetchone()
|
351
|
+
if not database_exists:
|
352
|
+
# 如果数据库不存在,则新建
|
353
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
354
|
+
cursor.execute(sql)
|
355
|
+
connection.commit()
|
356
|
+
logger.info(f"创建Database: {db_name}")
|
541
357
|
|
542
|
-
|
358
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
359
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
360
|
+
if not connection:
|
361
|
+
return
|
362
|
+
with connection.cursor() as cursor:
|
363
|
+
# 1. 查询表, 不存在则创建一个空表
|
364
|
+
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
365
|
+
cursor.execute(sql, (table_name,))
|
366
|
+
if not cursor.fetchone():
|
367
|
+
sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
|
368
|
+
cursor.execute(sql)
|
369
|
+
logger.info(f'创建 mysql 表: {table_name}')
|
543
370
|
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
primary_keys = ['id']
|
371
|
+
# 根据 dict_data 的值添加指定的数据类型
|
372
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
373
|
+
if set_typ:
|
374
|
+
# 更新自定义的列数据类型
|
375
|
+
for k, v in dtypes.copy().items():
|
376
|
+
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
377
|
+
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
552
378
|
|
553
|
-
|
554
|
-
|
555
|
-
|
379
|
+
# 检查列
|
380
|
+
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
381
|
+
cursor.execute(sql, (db_name, table_name))
|
382
|
+
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
|
383
|
+
col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
|
384
|
+
# 不存在则新建列
|
385
|
+
if col_not_exist: # 数据表中不存在的列
|
386
|
+
for col in col_not_exist:
|
387
|
+
# 创建列,需转义
|
388
|
+
if allow_not_null:
|
389
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
|
390
|
+
else:
|
391
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
392
|
+
cursor.execute(sql)
|
393
|
+
logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
556
394
|
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
395
|
+
if col == '日期':
|
396
|
+
sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
397
|
+
logger.info(f"设置为索引: {col}({dtypes[col]})")
|
398
|
+
cursor.execute(sql)
|
399
|
+
connection.commit() # 提交事务
|
400
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
401
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
402
|
+
# 处理插入的数据
|
403
|
+
if icm_update:
|
404
|
+
""" 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
|
405
|
+
sql = """SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s"""
|
406
|
+
cursor.execute(sql, (db_name, table_name))
|
407
|
+
cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()] # 数据表的所有列, 返回 list
|
564
408
|
|
565
|
-
|
566
|
-
|
567
|
-
CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
|
568
|
-
{','.join(column_defs)}
|
569
|
-
{primary_key_sql}
|
570
|
-
{unique_index_sql}
|
571
|
-
) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
|
572
|
-
"""
|
409
|
+
# 保留原始列名,不提前转义
|
410
|
+
raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id']
|
573
411
|
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
412
|
+
# 构建条件参数(使用原始列名)
|
413
|
+
condition_params = []
|
414
|
+
condition_parts = []
|
415
|
+
for up_col in icm_update:
|
416
|
+
condition_parts.append(f"`{up_col}` = %s") # SQL 转义
|
417
|
+
condition_params.append(dict_data[up_col]) # 原始列名访问数据
|
580
418
|
|
581
|
-
#
|
582
|
-
|
419
|
+
# 动态转义列名生成 SQL 查询字段
|
420
|
+
escaped_update_col = [f'`{col}`' for col in raw_update_col]
|
421
|
+
sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
|
422
|
+
cursor.execute(sql, condition_params)
|
423
|
+
results = cursor.fetchall()
|
583
424
|
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
425
|
+
if results:
|
426
|
+
for result in results:
|
427
|
+
change_col = []
|
428
|
+
change_placeholders = []
|
429
|
+
set_params = []
|
430
|
+
for raw_col in raw_update_col:
|
431
|
+
# 使用原始列名访问数据
|
432
|
+
df_value = str(dict_data[raw_col])
|
433
|
+
mysql_value = str(result[raw_col])
|
590
434
|
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
index_statements.append(
|
597
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
|
598
|
-
)
|
435
|
+
# 清理小数点后多余的零
|
436
|
+
if '.' in df_value:
|
437
|
+
df_value = re.sub(r'0+$', '', df_value).rstrip('.')
|
438
|
+
if '.' in mysql_value:
|
439
|
+
mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
|
599
440
|
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
cursor.execute(stmt)
|
605
|
-
if self.logger:
|
606
|
-
self.logger.debug("Executed index statement: %s", stmt)
|
441
|
+
if df_value != mysql_value:
|
442
|
+
change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
|
443
|
+
set_params.append(dict_data[raw_col])
|
444
|
+
change_col.append(raw_col)
|
607
445
|
|
608
|
-
|
609
|
-
|
610
|
-
|
446
|
+
if change_placeholders:
|
447
|
+
full_params = set_params + condition_params
|
448
|
+
sql = f"""UPDATE `{table_name}`
|
449
|
+
SET {','.join(change_placeholders)}
|
450
|
+
WHERE {' AND '.join(condition_parts)}"""
|
451
|
+
cursor.execute(sql, full_params)
|
452
|
+
else: # 没有数据返回,则直接插入数据
|
453
|
+
# 参数化插入语句
|
454
|
+
keys = [f"`{k}`" for k in dict_data.keys()]
|
455
|
+
placeholders = ','.join(['%s'] * len(dict_data))
|
456
|
+
update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
|
457
|
+
sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
|
458
|
+
cursor.execute(sql, tuple(dict_data.values()))
|
459
|
+
connection.commit() # 提交数据库
|
460
|
+
connection.close()
|
461
|
+
return
|
611
462
|
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
463
|
+
# 常规插入处理(参数化)
|
464
|
+
keys = [f"`{k}`" for k in dict_data.keys()]
|
465
|
+
placeholders = ','.join(['%s'] * len(dict_data))
|
466
|
+
update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
|
467
|
+
sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
|
468
|
+
cursor.execute(sql, tuple(dict_data.values()))
|
469
|
+
connection.commit()
|
470
|
+
connection.close()
|
617
471
|
|
618
|
-
def
|
619
|
-
self,
|
620
|
-
db_name: str,
|
621
|
-
table_name: str,
|
622
|
-
data: Union[Dict, List[Dict], pd.DataFrame],
|
623
|
-
columns: Dict[str, str],
|
624
|
-
primary_keys: Optional[List[str]] = None,
|
625
|
-
check_duplicate: bool = False,
|
626
|
-
duplicate_columns: Optional[List[str]] = None,
|
627
|
-
allow_null: bool = False,
|
628
|
-
partition_by: Optional[str] = None,
|
629
|
-
partition_date_column: str = '日期',
|
630
|
-
auto_create: bool = True,
|
631
|
-
replace: bool = False,
|
632
|
-
indexes: Optional[List[str]] = None
|
633
|
-
):
|
634
|
-
"""
|
635
|
-
上传数据到数据库
|
636
|
-
|
637
|
-
:param db_name: 数据库名
|
638
|
-
:param table_name: 表名
|
639
|
-
:param data: 要上传的数据
|
640
|
-
:param columns: 列名和数据类型字典 {列名: 数据类型}
|
641
|
-
:param primary_keys: 主键列列表
|
642
|
-
:param check_duplicate: 是否检查重复,默认为False
|
643
|
-
:param duplicate_columns: 用于检查重复的列列表,如果不指定则使用所有列
|
644
|
-
:param allow_null: 是否允许空值,默认为False
|
645
|
-
:param partition_by: 分表方式 ('year' 或 'month'),默认为None不分表
|
646
|
-
:param partition_date_column: 用于分表的日期列名,默认为'date'
|
647
|
-
:param auto_create: 是否自动创建不存在的数据库或表,默认为True
|
648
|
-
:param replace: 是否使用REPLACE代替INSERT,默认为False
|
649
|
-
:param indexes: 需要创建索引的列列表
|
650
|
-
:raises ValueError: 如果参数无效或操作失败
|
651
|
-
"""
|
652
|
-
if self.logger:
|
653
|
-
self.logger.info(
|
654
|
-
"开始上传数据到 %s.%s (分表方式=%s, 替换模式=%s)",
|
655
|
-
db_name, table_name, partition_by, replace
|
656
|
-
)
|
657
|
-
|
658
|
-
# 验证参数
|
659
|
-
if not columns:
|
660
|
-
error_msg = "Columns specification is required"
|
661
|
-
if self.logger:
|
662
|
-
self.logger.error(error_msg)
|
663
|
-
raise ValueError(error_msg)
|
664
|
-
|
665
|
-
if partition_by and partition_by not in ['year', 'month']:
|
666
|
-
error_msg = "分表方式必须是 'year' 或 'month'"
|
667
|
-
if self.logger:
|
668
|
-
self.logger.error(error_msg)
|
669
|
-
raise ValueError(error_msg)
|
670
|
-
|
671
|
-
# 准备数据
|
672
|
-
prepared_data = self._prepare_data(data, columns, allow_null)
|
673
|
-
|
674
|
-
# 检查数据库是否存在
|
675
|
-
if not self._check_database_exists(db_name):
|
676
|
-
if auto_create:
|
677
|
-
self._create_database(db_name)
|
678
|
-
else:
|
679
|
-
error_msg = f"Database '{db_name}' does not exist"
|
680
|
-
if self.logger:
|
681
|
-
self.logger.error(error_msg)
|
682
|
-
raise ValueError(error_msg)
|
683
|
-
|
684
|
-
# 确定唯一索引列
|
685
|
-
unique_columns = None
|
686
|
-
if check_duplicate:
|
687
|
-
unique_columns = duplicate_columns if duplicate_columns else [col for col in columns.keys() if
|
688
|
-
col.lower() != 'id']
|
689
|
-
|
690
|
-
# 处理分表逻辑
|
691
|
-
if partition_by:
|
692
|
-
# 分组数据按分表
|
693
|
-
partitioned_data = {}
|
694
|
-
for row in prepared_data:
|
695
|
-
if partition_date_column not in row:
|
696
|
-
error_msg = f"异常缺失列 '{partition_date_column}'"
|
697
|
-
if self.logger:
|
698
|
-
self.logger.error(error_msg)
|
699
|
-
raise ValueError(error_msg)
|
700
|
-
part_table = self._get_partition_table_name(table_name, str(row[partition_date_column]), partition_by)
|
701
|
-
if part_table not in partitioned_data:
|
702
|
-
partitioned_data[part_table] = []
|
703
|
-
partitioned_data[part_table].append(row)
|
704
|
-
|
705
|
-
# 对每个分表执行上传
|
706
|
-
for part_table, part_data in partitioned_data.items():
|
707
|
-
self._upload_to_table(
|
708
|
-
db_name, part_table, part_data, columns,
|
709
|
-
primary_keys, check_duplicate, duplicate_columns,
|
710
|
-
allow_null, auto_create, partition_date_column,
|
711
|
-
replace, indexes, unique_columns
|
712
|
-
)
|
713
|
-
else:
|
714
|
-
# 不分表,直接上传
|
715
|
-
self._upload_to_table(
|
716
|
-
db_name, table_name, prepared_data, columns,
|
717
|
-
primary_keys, check_duplicate, duplicate_columns,
|
718
|
-
allow_null, auto_create, partition_date_column,
|
719
|
-
replace, indexes, unique_columns
|
720
|
-
)
|
721
|
-
|
722
|
-
if self.logger:
|
723
|
-
self.logger.info(
|
724
|
-
"成功上传 %d 行数据到 %s.%s",
|
725
|
-
len(prepared_data), db_name, table_name
|
726
|
-
)
|
727
|
-
|
728
|
-
def _upload_to_table(
|
729
|
-
self,
|
730
|
-
db_name: str,
|
731
|
-
table_name: str,
|
732
|
-
data: List[Dict],
|
733
|
-
columns: Dict[str, str],
|
734
|
-
primary_keys: Optional[List[str]],
|
735
|
-
check_duplicate: bool,
|
736
|
-
duplicate_columns: Optional[List[str]],
|
737
|
-
allow_null: bool,
|
738
|
-
auto_create: bool,
|
739
|
-
date_column: Optional[str],
|
740
|
-
replace: bool,
|
741
|
-
indexes: Optional[List[str]],
|
742
|
-
unique_columns: Optional[List[str]] = None
|
743
|
-
):
|
744
|
-
"""实际执行表上传的内部方法"""
|
745
|
-
# 检查表是否存在
|
746
|
-
if not self._check_table_exists(db_name, table_name):
|
747
|
-
if auto_create:
|
748
|
-
self._create_table(db_name, table_name, columns, primary_keys, date_column, indexes, unique_columns)
|
749
|
-
else:
|
750
|
-
error_msg = f"Table '{db_name}.{table_name}' does not exist"
|
751
|
-
if self.logger:
|
752
|
-
self.logger.error(error_msg)
|
753
|
-
raise ValueError(error_msg)
|
754
|
-
|
755
|
-
# 获取表结构并验证
|
756
|
-
table_columns = self._get_table_columns(db_name, table_name)
|
757
|
-
if not table_columns:
|
758
|
-
error_msg = f"Failed to get columns for table '{db_name}.{table_name}'"
|
759
|
-
if self.logger:
|
760
|
-
self.logger.error(error_msg)
|
761
|
-
raise ValueError(error_msg)
|
762
|
-
|
763
|
-
# 验证数据列与表列匹配
|
764
|
-
for col in columns:
|
765
|
-
if col not in table_columns:
|
766
|
-
error_msg = f"Column '{col}' not found in table '{db_name}.{table_name}'"
|
767
|
-
if self.logger:
|
768
|
-
self.logger.error(error_msg)
|
769
|
-
raise ValueError(error_msg)
|
770
|
-
|
771
|
-
# 插入数据
|
772
|
-
self._insert_data(
|
773
|
-
db_name, table_name, data, columns,
|
774
|
-
check_duplicate, duplicate_columns,
|
775
|
-
replace=replace
|
776
|
-
)
|
777
|
-
|
778
|
-
def _insert_data(
|
779
|
-
self,
|
780
|
-
db_name: str,
|
781
|
-
table_name: str,
|
782
|
-
data: List[Dict],
|
783
|
-
columns: Dict[str, str],
|
784
|
-
check_duplicate: bool = False,
|
785
|
-
duplicate_columns: Optional[List[str]] = None,
|
786
|
-
batch_size: int = 1000,
|
787
|
-
replace: bool = False
|
788
|
-
):
|
789
|
-
"""
|
790
|
-
插入数据到表中
|
791
|
-
|
792
|
-
:param db_name: 数据库名
|
793
|
-
:param table_name: 表名
|
794
|
-
:param data: 要插入的数据
|
795
|
-
:param columns: 列名和数据类型字典
|
796
|
-
:param check_duplicate: 是否检查重复
|
797
|
-
:param duplicate_columns: 用于检查重复的列列表
|
798
|
-
:param batch_size: 批量插入的大小
|
799
|
-
:param replace: 是否使用REPLACE代替INSERT
|
800
|
-
:raises Exception: 如果插入失败
|
801
|
-
"""
|
802
|
-
db_name = self._validate_identifier(db_name)
|
803
|
-
table_name = self._validate_identifier(table_name)
|
804
|
-
|
805
|
-
if not data:
|
806
|
-
if self.logger:
|
807
|
-
self.logger.warning("No data to insert into %s.%s", db_name, table_name)
|
808
|
-
return
|
809
|
-
|
810
|
-
# 获取所有列名
|
811
|
-
all_columns = [col for col in columns.keys() if col.lower() != 'id']
|
812
|
-
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
813
|
-
placeholders = ','.join(['%s'] * len(safe_columns))
|
814
|
-
|
815
|
-
# 构建SQL语句
|
816
|
-
operation = "REPLACE" if replace else "INSERT IGNORE" if check_duplicate else "INSERT"
|
817
|
-
|
818
|
-
if check_duplicate and not replace:
|
819
|
-
# 当check_duplicate=True时,使用INSERT IGNORE来跳过重复记录
|
820
|
-
sql = f"""
|
821
|
-
{operation} INTO `{db_name}`.`{table_name}`
|
822
|
-
(`{'`,`'.join(safe_columns)}`)
|
823
|
-
VALUES ({placeholders})
|
824
|
-
"""
|
825
|
-
else:
|
826
|
-
sql = f"""
|
827
|
-
{operation} INTO `{db_name}`.`{table_name}`
|
828
|
-
(`{'`,`'.join(safe_columns)}`)
|
829
|
-
VALUES ({placeholders})
|
830
|
-
"""
|
831
|
-
|
832
|
-
if len(self._prepared_statements) >= self._max_cached_statements:
|
833
|
-
# 移除最旧的缓存
|
834
|
-
oldest_key = next(iter(self._prepared_statements))
|
835
|
-
del self._prepared_statements[oldest_key]
|
836
|
-
|
837
|
-
# 缓存预处理语句
|
838
|
-
cache_key = f"{db_name}.{table_name}.{operation}.{check_duplicate}"
|
839
|
-
if cache_key not in self._prepared_statements:
|
840
|
-
self._prepared_statements[cache_key] = sql
|
841
|
-
if self.logger:
|
842
|
-
self.logger.debug("已缓存预处理语句: %s", cache_key)
|
843
|
-
|
844
|
-
# 分批插入数据
|
845
|
-
with self._get_connection() as conn:
|
846
|
-
with conn.cursor() as cursor:
|
847
|
-
for i in range(0, len(data), batch_size):
|
848
|
-
batch = data[i:i + batch_size]
|
849
|
-
# 准备批量数据
|
850
|
-
values = []
|
851
|
-
for row in batch:
|
852
|
-
row_values = []
|
853
|
-
for col in all_columns:
|
854
|
-
row_values.append(row.get(col))
|
855
|
-
values.append(row_values)
|
856
|
-
|
857
|
-
# 执行批量插入
|
858
|
-
try:
|
859
|
-
start_time = time.time()
|
860
|
-
cursor.executemany(sql, values)
|
861
|
-
conn.commit() # 每个批次提交一次
|
862
|
-
if self.logger:
|
863
|
-
self.logger.debug(
|
864
|
-
"成功插入批次 %d-%d/%d 到 %s.%s, 耗时 %.2f 秒",
|
865
|
-
i + 1, min(i + batch_size, len(data)), len(data),
|
866
|
-
db_name, table_name, time.time() - start_time
|
867
|
-
)
|
868
|
-
except Exception as e:
|
869
|
-
conn.rollback()
|
870
|
-
error_msg = f"Failed to insert batch {i + 1}-{min(i + batch_size, len(data))}/{len(data)} into {db_name}.{table_name}: {str(e)}"
|
871
|
-
if self.logger:
|
872
|
-
self.logger.error(error_msg)
|
873
|
-
raise Exception(error_msg)
|
874
|
-
|
875
|
-
def close(self):
|
876
|
-
"""关闭连接池"""
|
877
|
-
if hasattr(self, 'pool') and self.pool:
|
878
|
-
try:
|
879
|
-
# 先关闭所有连接
|
880
|
-
while True:
|
881
|
-
conn = getattr(self.pool, '_connections', None)
|
882
|
-
if not conn or not conn.queue:
|
883
|
-
break
|
884
|
-
try:
|
885
|
-
conn = self.pool.connection()
|
886
|
-
conn.close()
|
887
|
-
except:
|
888
|
-
pass
|
889
|
-
|
890
|
-
# 然后关闭连接池
|
891
|
-
self.pool.close()
|
892
|
-
if self.logger:
|
893
|
-
self.logger.info("连接池已成功关闭")
|
894
|
-
except Exception as e:
|
895
|
-
if self.logger:
|
896
|
-
self.logger.error("关闭连接池失败: %s", str(e))
|
897
|
-
raise
|
898
|
-
self.pool = None
|
899
|
-
|
900
|
-
def __enter__(self):
|
901
|
-
return self
|
902
|
-
|
903
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
904
|
-
self.close()
|
905
|
-
if exc_type is not None and self.logger:
|
906
|
-
self.logger.error(
|
907
|
-
"Exception occurred: %s: %s",
|
908
|
-
exc_type.__name__, str(exc_val),
|
909
|
-
exc_info=(exc_type, exc_val, exc_tb)
|
910
|
-
)
|
911
|
-
|
912
|
-
|
913
|
-
class MysqlUpload:
|
914
|
-
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
915
|
-
self.username = username
|
916
|
-
self.password = password
|
917
|
-
self.host = host
|
918
|
-
self.port = port
|
919
|
-
if username == '' or password == '' or host == '' or port == 0:
|
920
|
-
self.config = None
|
921
|
-
else:
|
922
|
-
self.config = {
|
923
|
-
'host': self.host,
|
924
|
-
'port': int(self.port),
|
925
|
-
'user': self.username,
|
926
|
-
'password': self.password,
|
927
|
-
'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
|
928
|
-
'cursorclass': pymysql.cursors.DictCursor,
|
929
|
-
}
|
930
|
-
self.filename = None
|
931
|
-
|
932
|
-
@staticmethod
|
933
|
-
def try_except(func): # 在类内部定义一个异常处理方法
|
934
|
-
|
935
|
-
@wraps(func)
|
936
|
-
def wrapper(*args, **kwargs):
|
937
|
-
try:
|
938
|
-
return func(*args, **kwargs)
|
939
|
-
except Exception as e:
|
940
|
-
logger.error(f'{func.__name__}, {e}') # 将异常信息返回
|
941
|
-
|
942
|
-
return wrapper
|
943
|
-
|
944
|
-
def keep_connect(self, _db_name, _config, max_try: int=10):
|
945
|
-
attempts = 1
|
946
|
-
while attempts <= max_try:
|
947
|
-
try:
|
948
|
-
connection = pymysql.connect(**_config) # 连接数据库
|
949
|
-
return connection
|
950
|
-
except Exception as e:
|
951
|
-
logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
|
952
|
-
attempts += 1
|
953
|
-
time.sleep(30)
|
954
|
-
logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
|
955
|
-
return None
|
956
|
-
|
957
|
-
def cover_doc_dtypes(self, dict_data):
|
472
|
+
def cover_dict_dtypes(self, dict_data):
|
958
473
|
""" 清理字典键值 并转换数据类型 """
|
959
474
|
if not dict_data:
|
960
475
|
logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
|
@@ -967,6 +482,14 @@ class MysqlUpload:
|
|
967
482
|
k = k.replace(')', '')
|
968
483
|
k = re.sub(r'_{2,}', '_', k)
|
969
484
|
k = re.sub(r'_+$', '', k)
|
485
|
+
if str(v) == '':
|
486
|
+
v = 0
|
487
|
+
v = str(v)
|
488
|
+
v = re.sub('^="|"$', '', v, re.I)
|
489
|
+
v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(v)) # 移除控制字符
|
490
|
+
if re.findall(r'^[-+]?\d+\.?\d*%$', v):
|
491
|
+
v = str(float(v.rstrip("%")) / 100)
|
492
|
+
|
970
493
|
result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
|
971
494
|
result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
|
972
495
|
result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
|
@@ -991,6 +514,8 @@ class MysqlUpload:
|
|
991
514
|
__res_dict.update({k: 'INT'})
|
992
515
|
elif count_float > 0:
|
993
516
|
if count_int + count_float > 10:
|
517
|
+
# if count_float > 5:
|
518
|
+
# v = round(float(v), 4)
|
994
519
|
if count_float >= 6:
|
995
520
|
__res_dict.update({k: 'decimal(14,6)'})
|
996
521
|
else:
|
@@ -1004,45 +529,110 @@ class MysqlUpload:
|
|
1004
529
|
else:
|
1005
530
|
__res_dict.update({k: 'varchar(255)'})
|
1006
531
|
new_dict_data.update({k: v})
|
1007
|
-
__res_dict.update({'数据主体': 'longblob'})
|
1008
532
|
return __res_dict, new_dict_data
|
1009
533
|
|
534
|
+
def convert_df_dtypes(self, df: pd.DataFrame):
|
535
|
+
""" 清理 df 的值和列名,并转换数据类型 """
|
536
|
+
df = otk.cover_df(df=df) # 清理 df 的值和列名
|
537
|
+
[pd.to_numeric(df[col], errors='ignore') for col in df.columns.tolist()]
|
538
|
+
dtypes = df.dtypes.to_dict()
|
539
|
+
__res_dict = {}
|
540
|
+
for k, v in dtypes.copy().items():
|
541
|
+
result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
|
542
|
+
result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
|
543
|
+
result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
|
544
|
+
result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
|
545
|
+
|
546
|
+
if result1: # id/sku/spu商品信息
|
547
|
+
__res_dict.update({k: 'varchar(50)'})
|
548
|
+
elif result2: # 小数
|
549
|
+
__res_dict.update({k: 'decimal(10,4)'})
|
550
|
+
elif result3: # 小数
|
551
|
+
__res_dict.update({k: 'decimal(12,4)'})
|
552
|
+
elif result4: # 小数
|
553
|
+
__res_dict.update({k: 'decimal(12,2)'})
|
554
|
+
elif k == '日期':
|
555
|
+
__res_dict.update({k: 'date'})
|
556
|
+
elif k == '更新时间':
|
557
|
+
__res_dict.update({k: 'timestamp'})
|
558
|
+
elif v == 'int64':
|
559
|
+
__res_dict.update({k: 'int'})
|
560
|
+
elif v == 'float64':
|
561
|
+
__res_dict.update({k: 'decimal(10,4)'})
|
562
|
+
elif v == 'bool':
|
563
|
+
__res_dict.update({k: 'boolean'})
|
564
|
+
elif v == 'datetime64[ns]':
|
565
|
+
__res_dict.update({k: 'datetime'})
|
566
|
+
else:
|
567
|
+
__res_dict.update({k: 'varchar(255)'})
|
568
|
+
return __res_dict, df
|
569
|
+
|
1010
570
|
@try_except
|
1011
|
-
def
|
571
|
+
def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False,
|
572
|
+
filename=None, count=None, allow_not_null=False, cut_data=None):
|
1012
573
|
"""
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
574
|
+
db_name: 数据库名
|
575
|
+
table_name: 表名
|
576
|
+
move_insert: 根据df 的日期,先移除数据库数据,再插入, df_sql, icm_update 都要设置为 False
|
577
|
+
原则上只限于聚合数据使用,原始数据插入时不要设置
|
578
|
+
df_sql: 这是一个临时参数, 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重,初创表大量上传数据的时候使用
|
579
|
+
icm_update: 增量更新, 在聚合数据中使用,原始文件不要使用
|
580
|
+
使用增量更新: 必须确保 icm_update 传进来的列必须是数据表中唯一主键,值不会发生变化,不会重复,否则可能产生错乱覆盖情况
|
581
|
+
filename: 用来追踪处理进度,传这个参数是方便定位产生错误的文件
|
1018
582
|
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
1019
583
|
"""
|
1020
584
|
if not self.config:
|
1021
585
|
return
|
586
|
+
if icm_update:
|
587
|
+
if move_insert or df_sql:
|
588
|
+
logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
|
589
|
+
return
|
590
|
+
if move_insert:
|
591
|
+
if icm_update or df_sql:
|
592
|
+
logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
|
593
|
+
return
|
1022
594
|
|
1023
|
-
|
1024
|
-
|
595
|
+
self.filename = filename
|
596
|
+
if isinstance(df, pd.DataFrame):
|
597
|
+
if len(df) == 0:
|
598
|
+
logger.info(f'{db_name}: {table_name} 传入的 df 数据长度为0, {self.filename}')
|
599
|
+
return
|
600
|
+
else:
|
601
|
+
logger.info(f'{db_name}: {table_name} 传入的 df 不是有效的 dataframe 结构, {self.filename}')
|
1025
602
|
return
|
1026
|
-
|
603
|
+
if not db_name or db_name == 'None':
|
604
|
+
logger.info(f'{db_name} 不能为 None')
|
605
|
+
return
|
606
|
+
|
1027
607
|
if cut_data:
|
1028
|
-
if '日期' in
|
608
|
+
if '日期' in df.columns.tolist():
|
1029
609
|
try:
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
610
|
+
df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
|
611
|
+
min_year = df['日期'].min(skipna=True).year
|
612
|
+
min_month = df['日期'].min(skipna=True).month
|
613
|
+
if 0 < int(min_month) < 10 and not str(min_month).startswith('0'):
|
614
|
+
min_month = f'0{min_month}'
|
615
|
+
if str(cut_data).lower() == 'year':
|
616
|
+
table_name = f'{table_name}_{min_year}'
|
1034
617
|
elif str(cut_data).lower() == 'month':
|
1035
|
-
table_name = f'{table_name}_{
|
618
|
+
table_name = f'{table_name}_{min_year}-{min_month}'
|
1036
619
|
else:
|
1037
620
|
logger.info(f'参数不正确,cut_data应为 year 或 month ')
|
1038
621
|
except Exception as e:
|
1039
622
|
logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
623
|
+
# 清理 dataframe 非法值,并转换获取数据类型
|
624
|
+
dtypes, df = self.convert_df_dtypes(df)
|
625
|
+
if set_typ:
|
626
|
+
# 更新自定义的列数据类型
|
627
|
+
for k, v in dtypes.copy().items():
|
628
|
+
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
629
|
+
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
1040
630
|
|
1041
631
|
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
1042
632
|
if not connection:
|
1043
633
|
return
|
1044
634
|
with connection.cursor() as cursor:
|
1045
|
-
cursor.execute(
|
635
|
+
cursor.execute("SHOW DATABASES LIKE %s", (db_name,)) # 检查数据库是否存在
|
1046
636
|
database_exists = cursor.fetchone()
|
1047
637
|
if not database_exists:
|
1048
638
|
# 如果数据库不存在,则新建
|
@@ -1060,916 +650,1536 @@ class MysqlUpload:
|
|
1060
650
|
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
1061
651
|
cursor.execute(sql, (table_name,))
|
1062
652
|
if not cursor.fetchone():
|
1063
|
-
|
1064
|
-
cursor.execute(
|
653
|
+
create_table_sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY)"
|
654
|
+
cursor.execute(create_table_sql)
|
1065
655
|
logger.info(f'创建 mysql 表: {table_name}')
|
1066
656
|
|
1067
|
-
#
|
1068
|
-
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
1069
|
-
if set_typ:
|
1070
|
-
# 更新自定义的列数据类型
|
1071
|
-
for k, v in dtypes.copy().items():
|
1072
|
-
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
1073
|
-
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
1074
|
-
|
1075
|
-
# 检查列
|
657
|
+
# 有特殊字符不需转义
|
1076
658
|
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
1077
659
|
cursor.execute(sql, (db_name, table_name))
|
1078
|
-
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
|
1079
|
-
|
1080
|
-
|
660
|
+
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
|
661
|
+
cols = df.columns.tolist()
|
662
|
+
col_not_exist = [col for col in cols if col not in col_exist]
|
663
|
+
|
664
|
+
# 检查列,不存在则新建列
|
1081
665
|
if col_not_exist: # 数据表中不存在的列
|
1082
666
|
for col in col_not_exist:
|
1083
667
|
# 创建列,需转义
|
1084
|
-
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
cursor.execute(sql)
|
668
|
+
alter_sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]}"
|
669
|
+
if not allow_not_null:
|
670
|
+
alter_sql += " NOT NULL"
|
671
|
+
cursor.execute(alter_sql)
|
1090
672
|
logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
1091
673
|
|
674
|
+
# 创建索引
|
1092
675
|
if col == '日期':
|
1093
|
-
sql = f"
|
1094
|
-
|
1095
|
-
cursor.
|
1096
|
-
|
676
|
+
sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
|
677
|
+
cursor.execute(sql, (col,))
|
678
|
+
result = cursor.fetchone() # 检查索引是否存在
|
679
|
+
if not result:
|
680
|
+
cursor.execute(f"CREATE INDEX index_name ON `{table_name}`(`{col}`)")
|
1097
681
|
connection.commit() # 提交事务
|
1098
|
-
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
1099
|
-
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
1100
|
-
# 处理插入的数据
|
1101
|
-
for dict_data in dict_data_list:
|
1102
|
-
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
1103
|
-
if icm_update:
|
1104
|
-
""" 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
|
1105
|
-
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
1106
|
-
cursor.execute(sql, (db_name, table_name))
|
1107
|
-
columns = cursor.fetchall()
|
1108
|
-
cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
|
1109
|
-
# 保留原始列名,不提前转义
|
1110
|
-
raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
|
1111
|
-
|
1112
|
-
# 构建条件参数(使用原始列名)
|
1113
|
-
condition_params = []
|
1114
|
-
condition_parts = []
|
1115
|
-
for up_col in icm_update:
|
1116
|
-
condition_parts.append(f"`{up_col}` = %s") # SQL 转义
|
1117
|
-
condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
|
1118
682
|
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
683
|
+
if df_sql:
|
684
|
+
logger.info(f'正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
|
685
|
+
engine = create_engine(
|
686
|
+
f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
|
687
|
+
df.to_sql(
|
688
|
+
name=table_name,
|
689
|
+
con=engine,
|
690
|
+
if_exists='append',
|
691
|
+
index=False,
|
692
|
+
chunksize=1000,
|
693
|
+
method='multi'
|
694
|
+
)
|
695
|
+
connection.commit() # 提交事务
|
696
|
+
connection.close()
|
697
|
+
return
|
1124
698
|
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
df_value = str(dict_data[raw_col])
|
1133
|
-
mysql_value = str(result[raw_col])
|
699
|
+
# 5. 移除指定日期范围内的数据,原则上只限于聚合数据使用,原始数据插入时不要设置
|
700
|
+
if move_insert and '日期' in df.columns.tolist():
|
701
|
+
# 移除数据
|
702
|
+
dates = df['日期'].values.tolist()
|
703
|
+
dates = [pd.to_datetime(item) for item in dates] # 需要先转换类型才能用 min, max
|
704
|
+
start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
|
705
|
+
end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
|
1134
706
|
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
707
|
+
delete_sql = f"""
|
708
|
+
DELETE FROM `{table_name}`
|
709
|
+
WHERE 日期 BETWEEN %s AND %s
|
710
|
+
"""
|
711
|
+
cursor.execute(delete_sql, (start_date, end_date))
|
712
|
+
connection.commit()
|
1140
713
|
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
714
|
+
# 插入数据
|
715
|
+
engine = create_engine(
|
716
|
+
f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
|
717
|
+
df.to_sql(
|
718
|
+
name=table_name,
|
719
|
+
con=engine,
|
720
|
+
if_exists='append',
|
721
|
+
index=False,
|
722
|
+
chunksize=1000,
|
723
|
+
method='multi'
|
724
|
+
)
|
725
|
+
return
|
1145
726
|
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
727
|
+
datas = df.to_dict(orient='records')
|
728
|
+
for data in datas:
|
729
|
+
# data 是传进来待处理的数据, 不是数据库数据
|
730
|
+
# data 示例: {'日期': Timestamp('2024-08-27 00:00:00'), '推广费余额': 33299, '品销宝余额': 2930.73, '短信剩余': 67471}
|
731
|
+
try:
|
732
|
+
# 预处理数据:转换非字符串类型
|
733
|
+
processed_data = {}
|
734
|
+
for k, v in data.items():
|
735
|
+
if isinstance(v, (int, float)):
|
736
|
+
processed_data[k] = float(v)
|
737
|
+
elif isinstance(v, pd.Timestamp):
|
738
|
+
processed_data[k] = v.strftime('%Y-%m-%d')
|
739
|
+
else:
|
740
|
+
processed_data[k] = str(v)
|
1160
741
|
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
|
1166
|
-
placeholders = ', '.join(['%s'] * len(dict_data))
|
1167
|
-
update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
|
742
|
+
# 构建基础SQL要素
|
743
|
+
columns = [f'`{k}`' for k in processed_data.keys()]
|
744
|
+
placeholders = ', '.join(['%s'] * len(processed_data))
|
745
|
+
values = list(processed_data.values())
|
1168
746
|
|
1169
|
-
|
1170
|
-
|
1171
|
-
try:
|
1172
|
-
cursor.execute(sql, tuple(dict_data.values()))
|
1173
|
-
connection.commit()
|
1174
|
-
except pymysql.Error as e:
|
1175
|
-
logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
|
1176
|
-
connection.rollback()
|
1177
|
-
connection.close()
|
747
|
+
# 构建基本INSERT语句
|
748
|
+
insert_sql = f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({placeholders})"
|
1178
749
|
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
"""
|
1189
|
-
if not self.config:
|
1190
|
-
return
|
750
|
+
if icm_update: # 增量更新, 专门用于聚合数据,其他库不要调用
|
751
|
+
# 获取数据表结构
|
752
|
+
cursor.execute(
|
753
|
+
"SELECT COLUMN_NAME FROM information_schema.columns "
|
754
|
+
"WHERE table_schema = %s AND table_name = %s",
|
755
|
+
(db_name, table_name)
|
756
|
+
)
|
757
|
+
cols_exist = [row['COLUMN_NAME'] for row in cursor.fetchall()]
|
758
|
+
update_columns = [col for col in cols_exist if col not in icm_update and col != 'id']
|
1191
759
|
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
table_name = f'{table_name}_{__y}'
|
1199
|
-
elif str(cut_data).lower() == 'month':
|
1200
|
-
table_name = f'{table_name}_{__y_m}'
|
1201
|
-
else:
|
1202
|
-
logger.info(f'参数不正确,cut_data应为 year 或 month ')
|
1203
|
-
except Exception as e:
|
1204
|
-
logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
760
|
+
# 构建WHERE条件
|
761
|
+
where_conditions = []
|
762
|
+
where_values = []
|
763
|
+
for col in icm_update:
|
764
|
+
where_conditions.append(f"`{col}` = %s")
|
765
|
+
where_values.append(processed_data[col])
|
1205
766
|
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1211
|
-
database_exists = cursor.fetchone()
|
1212
|
-
if not database_exists:
|
1213
|
-
# 如果数据库不存在,则新建
|
1214
|
-
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
1215
|
-
cursor.execute(sql)
|
1216
|
-
connection.commit()
|
1217
|
-
logger.info(f"创建Database: {db_name}")
|
767
|
+
# 查询现有数据
|
768
|
+
select_sql = f"SELECT {', '.join([f'`{col}`' for col in update_columns])} " \
|
769
|
+
f"FROM `{table_name}` WHERE {' AND '.join(where_conditions)}"
|
770
|
+
cursor.execute(select_sql, where_values)
|
771
|
+
existing_data = cursor.fetchone()
|
1218
772
|
|
1219
|
-
|
1220
|
-
|
1221
|
-
|
1222
|
-
|
1223
|
-
|
1224
|
-
|
1225
|
-
|
1226
|
-
cursor.execute(sql, (table_name,))
|
1227
|
-
if not cursor.fetchone():
|
1228
|
-
sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
|
1229
|
-
cursor.execute(sql)
|
1230
|
-
logger.info(f'创建 mysql 表: {table_name}')
|
773
|
+
if existing_data:
|
774
|
+
# 比较并构建更新语句
|
775
|
+
update_set = []
|
776
|
+
update_values = []
|
777
|
+
for col in update_columns:
|
778
|
+
db_value = existing_data[col]
|
779
|
+
new_value = processed_data[col]
|
1231
780
|
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
781
|
+
# 处理数值类型的精度差异
|
782
|
+
if isinstance(db_value, float) and isinstance(new_value, float):
|
783
|
+
if not math.isclose(db_value, new_value, rel_tol=1e-9):
|
784
|
+
update_set.append(f"`{col}` = %s")
|
785
|
+
update_values.append(new_value)
|
786
|
+
elif db_value != new_value:
|
787
|
+
update_set.append(f"`{col}` = %s")
|
788
|
+
update_values.append(new_value)
|
1239
789
|
|
1240
|
-
|
1241
|
-
|
1242
|
-
|
1243
|
-
|
1244
|
-
|
1245
|
-
|
1246
|
-
if col_not_exist: # 数据表中不存在的列
|
1247
|
-
for col in col_not_exist:
|
1248
|
-
# 创建列,需转义
|
1249
|
-
if allow_not_null:
|
1250
|
-
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
|
790
|
+
if update_set:
|
791
|
+
update_sql = f"UPDATE `{table_name}` SET {', '.join(update_set)} " \
|
792
|
+
f"WHERE {' AND '.join(where_conditions)}"
|
793
|
+
cursor.execute(update_sql, update_values + where_values)
|
794
|
+
else:
|
795
|
+
cursor.execute(insert_sql, values)
|
1251
796
|
else:
|
1252
|
-
|
797
|
+
# 普通插入
|
798
|
+
cursor.execute(insert_sql, values)
|
799
|
+
except Exception as e:
|
800
|
+
pass
|
801
|
+
connection.commit() # 提交事务
|
802
|
+
connection.close()
|
803
|
+
|
804
|
+
|
805
|
+
class OptimizeDatas:
|
806
|
+
"""
|
807
|
+
数据维护 删除 mysql 的冗余数据
|
808
|
+
更新过程:
|
809
|
+
1. 读取所有数据表
|
810
|
+
2. 遍历表, 遍历列, 如果存在日期列则按天遍历所有日期, 不存在则全表读取
|
811
|
+
3. 按天删除所有冗余数据(存在日期列时)
|
812
|
+
tips: 查找冗余数据的方式是创建一个临时迭代器, 逐行读取数据并添加到迭代器, 出现重复时将重复数据的 id 添加到临时列表, 按列表 id 执行删除
|
813
|
+
"""
|
814
|
+
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
815
|
+
self.username = username
|
816
|
+
self.password = password
|
817
|
+
self.host = host
|
818
|
+
self.port = port # 默认端口, 此后可能更新,不作为必传参数
|
819
|
+
self.charset = charset
|
820
|
+
self.config = {
|
821
|
+
'host': self.host,
|
822
|
+
'port': int(self.port),
|
823
|
+
'user': self.username,
|
824
|
+
'password': self.password,
|
825
|
+
'charset': self.charset, # utf8mb4 支持存储四字节的UTF-8字符集
|
826
|
+
'cursorclass': pymysql.cursors.DictCursor,
|
827
|
+
}
|
828
|
+
self.db_name_lists: list = [] # 更新多个数据库 删除重复数据
|
829
|
+
self.db_name = None
|
830
|
+
self.days: int = 63 # 对近 N 天的数据进行排重
|
831
|
+
self.end_date = None
|
832
|
+
self.start_date = None
|
833
|
+
self.connection = None
|
834
|
+
|
835
|
+
@staticmethod
|
836
|
+
def try_except(func): # 在类内部定义一个异常处理方法
|
837
|
+
|
838
|
+
@wraps(func)
|
839
|
+
def wrapper(*args, **kwargs):
|
840
|
+
try:
|
841
|
+
return func(*args, **kwargs)
|
842
|
+
except Exception as e:
|
843
|
+
logger.error(f'{func.__name__}, {e}') # 将异常信息返回
|
844
|
+
|
845
|
+
return wrapper
|
846
|
+
|
847
|
+
def keep_connect(self, _db_name, _config, max_try: int=10):
|
848
|
+
attempts = 1
|
849
|
+
while attempts <= max_try:
|
850
|
+
try:
|
851
|
+
connection = pymysql.connect(**_config) # 连接数据库
|
852
|
+
return connection
|
853
|
+
except Exception as e:
|
854
|
+
logger.error(f'{_db_name}连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
|
855
|
+
attempts += 1
|
856
|
+
time.sleep(30)
|
857
|
+
logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
|
858
|
+
return None
|
859
|
+
|
860
|
+
def optimize_list(self):
|
861
|
+
"""
|
862
|
+
更新多个数据库 移除冗余数据
|
863
|
+
需要设置 self.db_name_lists
|
864
|
+
"""
|
865
|
+
if not self.db_name_lists:
|
866
|
+
logger.info(f'尚未设置参数: self.db_name_lists')
|
867
|
+
return
|
868
|
+
for db_name in self.db_name_lists:
|
869
|
+
self.db_name = db_name
|
870
|
+
self.optimize()
|
871
|
+
|
872
|
+
def optimize(self, except_key=['更新时间']):
|
873
|
+
""" 更新一个数据库 移除冗余数据 """
|
874
|
+
if not self.db_name:
|
875
|
+
logger.info(f'尚未设置参数: self.db_name')
|
876
|
+
return
|
877
|
+
tables = self.table_list(db_name=self.db_name)
|
878
|
+
if not tables:
|
879
|
+
logger.info(f'{self.db_name} -> 数据表不存在')
|
880
|
+
return
|
881
|
+
|
882
|
+
# 日期初始化
|
883
|
+
if not self.end_date:
|
884
|
+
self.end_date = pd.to_datetime(datetime.datetime.today())
|
885
|
+
else:
|
886
|
+
self.end_date = pd.to_datetime(self.end_date)
|
887
|
+
if self.days:
|
888
|
+
self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
|
889
|
+
if not self.start_date:
|
890
|
+
self.start_date = self.end_date
|
891
|
+
else:
|
892
|
+
self.start_date = pd.to_datetime(self.start_date)
|
893
|
+
start_date_before = self.start_date
|
894
|
+
end_date_before = self.end_date
|
895
|
+
|
896
|
+
logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
|
897
|
+
for table_dict in tables:
|
898
|
+
for key, table_name in table_dict.items():
|
899
|
+
self.config.update({'database': self.db_name}) # 添加更新 config 字段
|
900
|
+
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
|
901
|
+
if not self.connection:
|
902
|
+
return
|
903
|
+
with self.connection.cursor() as cursor:
|
904
|
+
sql = f"SELECT 1 FROM `{table_name}` LIMIT 1"
|
1253
905
|
cursor.execute(sql)
|
1254
|
-
|
906
|
+
result = cursor.fetchone()
|
907
|
+
if not result:
|
908
|
+
logger.info(f'数据表: {table_name}, 数据长度为 0')
|
909
|
+
continue # 检查数据表是否为空
|
1255
910
|
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
911
|
+
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
|
912
|
+
columns = cursor.fetchall()
|
913
|
+
date_exist = False
|
914
|
+
for col in columns: # 遍历列信息,检查是否存在类型为日期的列
|
915
|
+
if col['Field'] == '日期' and (col['Type'] == 'date' or col['Type'].startswith('datetime')):
|
916
|
+
date_exist = True
|
917
|
+
break
|
918
|
+
if date_exist: # 存在日期列
|
919
|
+
sql_max = f"SELECT MAX(日期) AS max_date FROM `{table_name}`"
|
920
|
+
sql_min = f"SELECT MIN(日期) AS min_date FROM `{table_name}`"
|
921
|
+
cursor.execute(sql_max)
|
922
|
+
max_result = cursor.fetchone()
|
923
|
+
cursor.execute(sql_min)
|
924
|
+
min_result = cursor.fetchone()
|
925
|
+
# 匹配修改为合适的起始和结束日期
|
926
|
+
if self.start_date < pd.to_datetime(min_result['min_date']):
|
927
|
+
self.start_date = pd.to_datetime(min_result['min_date'])
|
928
|
+
if self.end_date > pd.to_datetime(max_result['max_date']):
|
929
|
+
self.end_date = pd.to_datetime(max_result['max_date'])
|
930
|
+
dates_list = self.day_list(start_date=self.start_date, end_date=self.end_date)
|
931
|
+
# dates_list 是日期列表
|
932
|
+
for date in dates_list:
|
933
|
+
self.delete_duplicate(table_name=table_name, date=date, except_key=except_key)
|
934
|
+
self.start_date = start_date_before # 重置,不然日期错乱
|
935
|
+
self.end_date = end_date_before
|
936
|
+
else: # 不存在日期列的情况
|
937
|
+
self.delete_duplicate2(table_name=table_name, except_key=except_key)
|
938
|
+
self.connection.close()
|
939
|
+
logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
|
1269
940
|
|
1270
|
-
|
1271
|
-
|
941
|
+
def delete_duplicate(self, table_name, date, except_key=['更新时间']):
|
942
|
+
datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
|
943
|
+
if not datas:
|
944
|
+
return
|
945
|
+
duplicate_id = [] # 出现重复的 id
|
946
|
+
all_datas = [] # 迭代器
|
947
|
+
for data in datas:
|
948
|
+
for e_key in except_key:
|
949
|
+
if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
|
950
|
+
del data[e_key]
|
951
|
+
try:
|
952
|
+
delete_id = data['id']
|
953
|
+
del data['id']
|
954
|
+
data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
|
955
|
+
if data in all_datas: # 数据出现重复时
|
956
|
+
if delete_id:
|
957
|
+
duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
|
958
|
+
continue
|
959
|
+
all_datas.append(data) # 数据没有重复
|
960
|
+
except Exception as e:
|
961
|
+
logger.debug(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> {e}')
|
962
|
+
del all_datas
|
1272
963
|
|
1273
|
-
|
1274
|
-
|
1275
|
-
condition_parts = []
|
1276
|
-
for up_col in icm_update:
|
1277
|
-
condition_parts.append(f"`{up_col}` = %s") # SQL 转义
|
1278
|
-
condition_params.append(dict_data[up_col]) # 原始列名访问数据
|
964
|
+
if not duplicate_id: # 如果没有重复数据,则跳过该数据表
|
965
|
+
return
|
1279
966
|
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
967
|
+
try:
|
968
|
+
with self.connection.cursor() as cursor:
|
969
|
+
placeholders = ', '.join(['%s'] * len(duplicate_id))
|
970
|
+
# 移除冗余数据
|
971
|
+
sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
|
972
|
+
cursor.execute(sql, duplicate_id)
|
973
|
+
logger.debug(f"{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
|
974
|
+
self.connection.commit() # 提交事务
|
975
|
+
except Exception as e:
|
976
|
+
logger.error(f'{self.db_name}/{table_name}, {e}')
|
977
|
+
self.connection.rollback() # 异常则回滚
|
978
|
+
|
979
|
+
def delete_duplicate2(self, table_name, except_key=['更新时间']):
|
980
|
+
with self.connection.cursor() as cursor:
|
981
|
+
sql = f"SELECT * FROM `{table_name}`" # 如果不包含日期列,则获取全部数据
|
982
|
+
cursor.execute(sql)
|
983
|
+
datas = cursor.fetchall()
|
984
|
+
if not datas:
|
985
|
+
return
|
986
|
+
duplicate_id = [] # 出现重复的 id
|
987
|
+
all_datas = [] # 迭代器
|
988
|
+
for data in datas:
|
989
|
+
for e_key in except_key:
|
990
|
+
if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
|
991
|
+
del data[e_key]
|
992
|
+
delete_id = data['id']
|
993
|
+
del data['id']
|
994
|
+
data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
|
995
|
+
if data in all_datas: # 数据出现重复时
|
996
|
+
duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
|
997
|
+
continue
|
998
|
+
all_datas.append(data) # 数据没有重复
|
999
|
+
del all_datas
|
1000
|
+
|
1001
|
+
if not duplicate_id: # 如果没有重复数据,则跳过该数据表
|
1002
|
+
return
|
1003
|
+
|
1004
|
+
try:
|
1005
|
+
with self.connection.cursor() as cursor:
|
1006
|
+
placeholders = ', '.join(['%s'] * len(duplicate_id))
|
1007
|
+
# 移除冗余数据
|
1008
|
+
sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
|
1009
|
+
cursor.execute(sql, duplicate_id)
|
1010
|
+
logger.info(f"{table_name} -> before: {len(datas)}, "
|
1011
|
+
f"remove: {cursor.rowcount}")
|
1012
|
+
self.connection.commit() # 提交事务
|
1013
|
+
except Exception as e:
|
1014
|
+
logger.error(f'{self.db_name}/{table_name}, {e}')
|
1015
|
+
self.connection.rollback() # 异常则回滚
|
1016
|
+
|
1017
|
+
def database_list(self):
|
1018
|
+
""" 获取所有数据库 """
|
1019
|
+
connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
|
1020
|
+
if not connection:
|
1021
|
+
return
|
1022
|
+
with connection.cursor() as cursor:
|
1023
|
+
cursor.execute("SHOW DATABASES")
|
1024
|
+
databases = cursor.fetchall() # 获取所有数据库的结果
|
1025
|
+
connection.close()
|
1026
|
+
return databases
|
1027
|
+
|
1028
|
+
def table_list(self, db_name):
|
1029
|
+
""" 获取指定数据库的所有数据表 """
|
1030
|
+
connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
|
1031
|
+
if not connection:
|
1032
|
+
return
|
1033
|
+
try:
|
1034
|
+
with connection.cursor() as cursor:
|
1035
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
1036
|
+
database_exists = cursor.fetchone()
|
1037
|
+
if not database_exists:
|
1038
|
+
logger.info(f'{db_name}: 数据表不存在!')
|
1039
|
+
return
|
1040
|
+
except Exception as e:
|
1041
|
+
logger.error(f'002 {e}')
|
1042
|
+
return
|
1043
|
+
finally:
|
1044
|
+
connection.close() # 断开连接
|
1045
|
+
|
1046
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
1047
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
1048
|
+
if not connection:
|
1049
|
+
return
|
1050
|
+
with connection.cursor() as cursor:
|
1051
|
+
cursor.execute("SHOW TABLES")
|
1052
|
+
tables = cursor.fetchall() # 获取所有数据表
|
1053
|
+
connection.close()
|
1054
|
+
return tables
|
1055
|
+
|
1056
|
+
def table_datas(self, db_name, table_name, date):
|
1057
|
+
"""
|
1058
|
+
获取指定数据表的数据, 按天获取
|
1059
|
+
"""
|
1060
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
1061
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
1062
|
+
if not connection:
|
1063
|
+
return
|
1064
|
+
try:
|
1065
|
+
with connection.cursor() as cursor:
|
1066
|
+
sql = f"SELECT * FROM `{table_name}` WHERE {'日期'} BETWEEN '%s' AND '%s'" % (date, date)
|
1067
|
+
cursor.execute(sql)
|
1284
1068
|
results = cursor.fetchall()
|
1069
|
+
except Exception as e:
|
1070
|
+
logger.error(f'001 {e}')
|
1071
|
+
finally:
|
1072
|
+
connection.close()
|
1073
|
+
return results
|
1285
1074
|
|
1286
|
-
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
1075
|
+
def day_list(self, start_date, end_date):
|
1076
|
+
start_date = pd.to_datetime(start_date)
|
1077
|
+
end_date = pd.to_datetime(end_date)
|
1078
|
+
date_list = []
|
1079
|
+
while start_date <= end_date:
|
1080
|
+
date_list.append(pd.to_datetime(start_date.date()))
|
1081
|
+
start_date += datetime.timedelta(days=1)
|
1082
|
+
return date_list
|
1083
|
+
|
1084
|
+
def rename_column(self):
|
1085
|
+
""" 批量修改数据库的列名 """
|
1086
|
+
"""
|
1087
|
+
# for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2',]:
|
1088
|
+
# s = OptimizeDatas(username=username, password=password, host=host, port=port)
|
1089
|
+
# s.db_name = db_name
|
1090
|
+
# s.rename_column()
|
1091
|
+
"""
|
1092
|
+
tables = self.table_list(db_name=self.db_name)
|
1093
|
+
for table_dict in tables:
|
1094
|
+
for key, table_name in table_dict.items():
|
1095
|
+
self.config.update({'database': self.db_name}) # 添加更新 config 字段
|
1096
|
+
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
|
1097
|
+
if not self.connection:
|
1098
|
+
return
|
1099
|
+
with self.connection.cursor() as cursor:
|
1100
|
+
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
|
1101
|
+
columns = cursor.fetchall()
|
1102
|
+
columns = [{column['Field']: column['Type']} for column in columns]
|
1103
|
+
for column in columns:
|
1104
|
+
for key, value in column.items():
|
1105
|
+
if key.endswith('_'):
|
1106
|
+
new_name = re.sub(r'_+$', '', key)
|
1107
|
+
sql = f"ALTER TABLE `{table_name}` CHANGE COLUMN {key} {new_name} {value}"
|
1108
|
+
cursor.execute(sql)
|
1109
|
+
self.connection.commit()
|
1110
|
+
if self.connection:
|
1111
|
+
self.connection.close()
|
1112
|
+
|
1113
|
+
|
1114
|
+
class MySQLUploader:
|
1115
|
+
def __init__(
|
1116
|
+
self,
|
1117
|
+
username: str,
|
1118
|
+
password: str,
|
1119
|
+
host: str = 'localhost',
|
1120
|
+
port: int = 3306,
|
1121
|
+
charset: str = 'utf8mb4',
|
1122
|
+
collation: str = 'utf8mb4_0900_ai_ci',
|
1123
|
+
logging_mode: str = 'console', # 'both'(控制台+文件), 'console'(仅控制台), 'file'(仅文件), 'none'(禁用)
|
1124
|
+
log_level: str = 'INFO', # 默认日志级别
|
1125
|
+
log_file: str = 'mysql_upload.log', # 日志文件路径
|
1126
|
+
max_log_size: int = 50, # 日志文件大小(MB)
|
1127
|
+
backup_count: int = 5, # 保留的日志文件数量
|
1128
|
+
max_retries: int = 10,
|
1129
|
+
retry_interval: int = 10,
|
1130
|
+
pool_size: int = 10,
|
1131
|
+
connect_timeout: int = 10,
|
1132
|
+
read_timeout: int = 30,
|
1133
|
+
write_timeout: int = 30,
|
1134
|
+
ssl: Optional[Dict] = None,
|
1135
|
+
enable_metrics: bool = True # 是否启用性能指标收集
|
1136
|
+
):
|
1137
|
+
"""
|
1138
|
+
:param username: 数据库用户名
|
1139
|
+
:param password: 数据库密码
|
1140
|
+
:param host: 数据库主机地址,默认为localhost
|
1141
|
+
:param port: 数据库端口,默认为3306
|
1142
|
+
:param charset: 字符集,默认为utf8mb4
|
1143
|
+
:param collation: 排序规则,默认为utf8mb4_0900_ai_ci
|
1144
|
+
:param logging_mode: 日志模式,可选 'both'(控制台+文件), 'console'(仅控制台), 'file'(仅文件), 'none'(禁用)
|
1145
|
+
:param log_level: 日志级别,默认为INFO
|
1146
|
+
:param log_file: 日志文件路径
|
1147
|
+
:param max_log_size: 日志文件最大大小(MB),默认为50
|
1148
|
+
:param backup_count: 保留的日志备份数量,默认为5
|
1149
|
+
:param max_retries: 最大重试次数,默认为10
|
1150
|
+
:param retry_interval: 重试间隔(秒),默认为10
|
1151
|
+
:param pool_size: 连接池大小,默认为5
|
1152
|
+
:param connect_timeout: 连接超时(秒),默认为10
|
1153
|
+
:param read_timeout: 读取超时(秒),默认为30
|
1154
|
+
:param write_timeout: 写入超时(秒),默认为30
|
1155
|
+
:param ssl: SSL配置字典,默认为None
|
1156
|
+
:param enable_metrics: 是否启用性能指标收集,默认为True
|
1157
|
+
"""
|
1158
|
+
self.username = username
|
1159
|
+
self.password = password
|
1160
|
+
self.host = host
|
1161
|
+
self.port = port
|
1162
|
+
self.charset = charset
|
1163
|
+
self.collation = collation
|
1164
|
+
self.max_retries = max(max_retries, 1)
|
1165
|
+
self.retry_interval = max(retry_interval, 1)
|
1166
|
+
self.pool_size = max(pool_size, 1)
|
1167
|
+
self.connect_timeout = connect_timeout
|
1168
|
+
self.read_timeout = read_timeout
|
1169
|
+
self.write_timeout = write_timeout
|
1170
|
+
self.ssl = ssl
|
1171
|
+
self._prepared_statements = {}
|
1172
|
+
self._max_cached_statements = 100
|
1173
|
+
self.enable_metrics = enable_metrics
|
1174
|
+
self.metrics = {
|
1175
|
+
'total_uploads': 0,
|
1176
|
+
'successful_uploads': 0,
|
1177
|
+
'failed_uploads': 0,
|
1178
|
+
'total_rows': 0,
|
1179
|
+
'successful_rows': 0,
|
1180
|
+
'failed_rows': 0,
|
1181
|
+
'total_retries': 0,
|
1182
|
+
'total_execution_time': 0.0,
|
1183
|
+
'connection_usage': [],
|
1184
|
+
'memory_usage': [],
|
1185
|
+
'cpu_usage': []
|
1186
|
+
}
|
1187
|
+
|
1188
|
+
# 初始化日志系统
|
1189
|
+
self._init_logging(logging_mode, log_level, log_file, max_log_size, backup_count)
|
1190
|
+
|
1191
|
+
# 创建连接池
|
1192
|
+
self.pool = self._create_connection_pool()
|
1193
|
+
|
1194
|
+
def _init_logging(
|
1195
|
+
self,
|
1196
|
+
logging_mode: str,
|
1197
|
+
log_level: str,
|
1198
|
+
log_file: str,
|
1199
|
+
max_log_size: int,
|
1200
|
+
backup_count: int
|
1201
|
+
):
|
1202
|
+
"""初始化结构化日志配置"""
|
1203
|
+
if logging_mode.lower() == 'none':
|
1204
|
+
self.logger = None
|
1205
|
+
return
|
1206
|
+
|
1207
|
+
valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
|
1208
|
+
level = log_level.upper() if log_level.upper() in valid_levels else 'INFO'
|
1209
|
+
|
1210
|
+
# 创建格式化器 - 使用结构化JSON格式
|
1211
|
+
class StructuredFormatter(logging.Formatter):
|
1212
|
+
def format(self, record):
|
1213
|
+
log_data = {
|
1214
|
+
'time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
1215
|
+
'level': record.levelname,
|
1216
|
+
'message': record.getMessage(),
|
1217
|
+
# 'logger': record.name,
|
1218
|
+
'module': record.module,
|
1219
|
+
'line': record.lineno,
|
1220
|
+
# 'process': record.process
|
1221
|
+
}
|
1222
|
+
|
1223
|
+
# 添加异常信息
|
1224
|
+
if record.exc_info:
|
1225
|
+
log_data['exception'] = self.formatException(record.exc_info)
|
1226
|
+
|
1227
|
+
return json.dumps(log_data, ensure_ascii=False)
|
1228
|
+
|
1229
|
+
# 创建日志记录器
|
1230
|
+
self.logger = logging.getLogger('upload')
|
1231
|
+
self.logger.setLevel(level)
|
1232
|
+
|
1233
|
+
# 防止重复添加handler
|
1234
|
+
if self.logger.handlers:
|
1235
|
+
for handler in self.logger.handlers[:]:
|
1236
|
+
self.logger.removeHandler(handler)
|
1237
|
+
|
1238
|
+
formatter = StructuredFormatter()
|
1239
|
+
mode = logging_mode.lower()
|
1240
|
+
|
1241
|
+
# 根据模式添加相应的handler
|
1242
|
+
if mode in ('both', 'console'):
|
1243
|
+
console_handler = logging.StreamHandler()
|
1244
|
+
console_handler.setFormatter(formatter)
|
1245
|
+
self.logger.addHandler(console_handler)
|
1246
|
+
|
1247
|
+
if mode in ('both', 'file'):
|
1248
|
+
file_handler = logging.handlers.RotatingFileHandler(
|
1249
|
+
filename=log_file,
|
1250
|
+
maxBytes=max_log_size * 1024 * 1024,
|
1251
|
+
backupCount=backup_count,
|
1252
|
+
encoding='utf-8'
|
1253
|
+
)
|
1254
|
+
file_handler.setFormatter(formatter)
|
1255
|
+
self.logger.addHandler(file_handler)
|
1256
|
+
|
1257
|
+
def _record_metrics(self, metric_name: str, value: Any = 1, is_timing: bool = False):
|
1258
|
+
"""记录性能指标"""
|
1259
|
+
if not self.enable_metrics:
|
1260
|
+
return
|
1261
|
+
|
1262
|
+
if metric_name not in self.metrics:
|
1263
|
+
self.metrics[metric_name] = []
|
1264
|
+
|
1265
|
+
if is_timing:
|
1266
|
+
# 如果是时间指标,记录时间戳和值
|
1267
|
+
self.metrics[metric_name].append({
|
1268
|
+
'timestamp': datetime.datetime.now().isoformat(),
|
1269
|
+
'value': value
|
1270
|
+
})
|
1271
|
+
else:
|
1272
|
+
# 其他指标直接累加
|
1273
|
+
if isinstance(self.metrics[metric_name], (int, float)):
|
1274
|
+
self.metrics[metric_name] += value
|
1275
|
+
elif isinstance(self.metrics[metric_name], list):
|
1276
|
+
self.metrics[metric_name].append({
|
1277
|
+
'timestamp': datetime.datetime.now().isoformat(),
|
1278
|
+
'value': value
|
1279
|
+
})
|
1280
|
+
|
1281
|
+
def _get_system_metrics(self):
|
1282
|
+
"""获取系统资源使用指标"""
|
1283
|
+
if not self.enable_metrics:
|
1284
|
+
return {}
|
1285
|
+
|
1286
|
+
metrics = {
|
1287
|
+
'memory': psutil.virtual_memory().percent,
|
1288
|
+
'cpu': psutil.cpu_percent(),
|
1289
|
+
}
|
1290
|
+
|
1291
|
+
# 更安全的连接数获取方式
|
1292
|
+
if hasattr(self, 'pool') and self.pool is not None:
|
1293
|
+
try:
|
1294
|
+
# 对于不同的连接池实现可能有不同的属性名
|
1295
|
+
if hasattr(self.pool, '_connections'):
|
1296
|
+
connections = self.pool._connections
|
1297
|
+
metrics['connections'] = len(connections) if hasattr(connections, '__len__') else 0
|
1298
|
+
else:
|
1299
|
+
metrics['connections'] = 0
|
1300
|
+
except Exception:
|
1301
|
+
metrics['connections'] = 0
|
1302
|
+
else:
|
1303
|
+
metrics['connections'] = 0
|
1304
|
+
|
1305
|
+
return metrics
|
1306
|
+
|
1307
|
+
def _log_with_metrics(self, level: str, message: str, extra: Optional[Dict] = None):
|
1308
|
+
"""日志记录"""
|
1309
|
+
if not self.logger:
|
1310
|
+
return
|
1311
|
+
|
1312
|
+
# 记录系统指标
|
1313
|
+
metrics = self._get_system_metrics()
|
1314
|
+
log_extra = {'metrics': metrics}
|
1315
|
+
if extra:
|
1316
|
+
log_extra.update(extra)
|
1317
|
+
|
1318
|
+
getattr(self.logger, level.lower())(message, extra={'extra_data': log_extra})
|
1319
|
+
|
1320
|
+
def _create_connection_pool(self) -> PooledDB:
|
1321
|
+
"""创建数据库连接池"""
|
1322
|
+
start_time = time.time()
|
1323
|
+
self.pool = None
|
1324
|
+
|
1325
|
+
pool_params = {
|
1326
|
+
'creator': pymysql,
|
1327
|
+
'host': self.host,
|
1328
|
+
'port': self.port,
|
1329
|
+
'user': self.username,
|
1330
|
+
'password': self.password,
|
1331
|
+
'charset': self.charset,
|
1332
|
+
'cursorclass': pymysql.cursors.DictCursor,
|
1333
|
+
'maxconnections': self.pool_size,
|
1334
|
+
'ping': 7,
|
1335
|
+
'connect_timeout': self.connect_timeout,
|
1336
|
+
'read_timeout': self.read_timeout,
|
1337
|
+
'write_timeout': self.write_timeout,
|
1338
|
+
'autocommit': False
|
1339
|
+
}
|
1340
|
+
|
1341
|
+
if self.ssl:
|
1342
|
+
required_keys = {'ca', 'cert', 'key'}
|
1343
|
+
if not all(k in self.ssl for k in required_keys):
|
1344
|
+
error_msg = "SSL配置必须包含ca、cert和key"
|
1345
|
+
self._log_with_metrics('error', error_msg)
|
1346
|
+
raise ValueError(error_msg)
|
1347
|
+
pool_params['ssl'] = {
|
1348
|
+
'ca': self.ssl['ca'],
|
1349
|
+
'cert': self.ssl['cert'],
|
1350
|
+
'key': self.ssl['key'],
|
1351
|
+
'check_hostname': self.ssl.get('check_hostname', False)
|
1352
|
+
}
|
1353
|
+
|
1354
|
+
try:
|
1355
|
+
pool = PooledDB(**pool_params)
|
1356
|
+
elapsed = time.time() - start_time
|
1357
|
+
self._record_metrics('connection_pool_creation_time', elapsed, is_timing=True)
|
1358
|
+
self._log_with_metrics('info', "连接池创建成功", {
|
1359
|
+
'pool_size': self.pool_size,
|
1360
|
+
'time_elapsed': elapsed
|
1361
|
+
})
|
1362
|
+
return pool
|
1363
|
+
except Exception as e:
|
1364
|
+
elapsed = time.time() - start_time
|
1365
|
+
self._record_metrics('connection_pool_failures', 1)
|
1366
|
+
self.pool = None
|
1367
|
+
self._log_with_metrics('error', "连接池创建失败", {
|
1368
|
+
'error': str(e),
|
1369
|
+
'time_elapsed': elapsed
|
1370
|
+
})
|
1371
|
+
raise ConnectionError(f"连接池创建失败: {str(e)}")
|
1372
|
+
|
1373
|
+
def _execute_with_retry(self, func):
|
1374
|
+
@wraps(func)
|
1375
|
+
def wrapper(*args, **kwargs):
|
1376
|
+
last_exception = None
|
1377
|
+
start_time = time.time()
|
1378
|
+
operation = func.__name__
|
1379
|
+
|
1380
|
+
self._log_with_metrics('debug', f"开始执行操作: {operation}", {
|
1381
|
+
'attempt': 1,
|
1382
|
+
'max_retries': self.max_retries
|
1383
|
+
})
|
1384
|
+
|
1385
|
+
for attempt in range(self.max_retries):
|
1386
|
+
try:
|
1387
|
+
result = func(*args, **kwargs)
|
1388
|
+
elapsed = time.time() - start_time
|
1389
|
+
|
1390
|
+
if attempt > 0:
|
1391
|
+
self._record_metrics('total_retries', attempt)
|
1392
|
+
self._log_with_metrics('info', "操作成功(重试后)", {
|
1393
|
+
'operation': operation,
|
1394
|
+
'attempts': attempt + 1,
|
1395
|
+
'time_elapsed': elapsed
|
1396
|
+
})
|
1397
|
+
else:
|
1398
|
+
self._log_with_metrics('debug', "操作成功", {
|
1399
|
+
'operation': operation,
|
1400
|
+
'time_elapsed': elapsed
|
1401
|
+
})
|
1402
|
+
|
1403
|
+
return result
|
1404
|
+
|
1405
|
+
except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
|
1406
|
+
last_exception = e
|
1407
|
+
self._record_metrics('database_errors', 1)
|
1295
1408
|
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1409
|
+
# 记录详细的MySQL错误信息
|
1410
|
+
error_details = {
|
1411
|
+
'operation': operation,
|
1412
|
+
'error_code': e.args[0] if e.args else None,
|
1413
|
+
'error_message': e.args[1] if len(e.args) > 1 else None,
|
1414
|
+
'attempt': attempt + 1,
|
1415
|
+
'max_retries': self.max_retries
|
1416
|
+
}
|
1301
1417
|
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1418
|
+
if attempt < self.max_retries - 1:
|
1419
|
+
wait_time = self.retry_interval * (attempt + 1)
|
1420
|
+
error_details['wait_time'] = wait_time
|
1421
|
+
self._log_with_metrics('warning', "数据库操作失败,准备重试", error_details)
|
1422
|
+
time.sleep(wait_time)
|
1306
1423
|
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1319
|
-
cursor.execute(sql, tuple(dict_data.values()))
|
1320
|
-
connection.commit() # 提交数据库
|
1321
|
-
connection.close()
|
1322
|
-
return
|
1424
|
+
# 尝试重新连接
|
1425
|
+
try:
|
1426
|
+
self.pool = self._create_connection_pool()
|
1427
|
+
self._log_with_metrics('info', "成功重新建立数据库连接")
|
1428
|
+
except Exception as reconnect_error:
|
1429
|
+
self._log_with_metrics('error', "重连失败", {
|
1430
|
+
'error': str(reconnect_error)
|
1431
|
+
})
|
1432
|
+
else:
|
1433
|
+
elapsed = time.time() - start_time
|
1434
|
+
error_details['time_elapsed'] = elapsed
|
1435
|
+
self._log_with_metrics('error', "操作最终失败", error_details)
|
1323
1436
|
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1437
|
+
except pymysql.IntegrityError as e:
|
1438
|
+
elapsed = time.time() - start_time
|
1439
|
+
self._record_metrics('integrity_errors', 1)
|
1440
|
+
self._log_with_metrics('error', "完整性约束错误", {
|
1441
|
+
'operation': operation,
|
1442
|
+
'time_elapsed': elapsed,
|
1443
|
+
'error_code': e.args[0] if e.args else None,
|
1444
|
+
'error_message': e.args[1] if len(e.args) > 1 else None
|
1445
|
+
})
|
1446
|
+
raise e
|
1332
1447
|
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
k = re.sub(r'_+$', '', k)
|
1346
|
-
if str(v) == '':
|
1347
|
-
v = 0
|
1348
|
-
v = str(v)
|
1349
|
-
v = re.sub('^="|"$', '', v, re.I)
|
1350
|
-
v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(v)) # 移除控制字符
|
1351
|
-
if re.findall(r'^[-+]?\d+\.?\d*%$', v):
|
1352
|
-
v = str(float(v.rstrip("%")) / 100)
|
1448
|
+
except Exception as e:
|
1449
|
+
last_exception = e
|
1450
|
+
elapsed = time.time() - start_time
|
1451
|
+
self._record_metrics('unexpected_errors', 1)
|
1452
|
+
self._log_with_metrics('error', "发生意外错误", {
|
1453
|
+
'operation': operation,
|
1454
|
+
'time_elapsed': elapsed,
|
1455
|
+
'error_type': type(e).__name__,
|
1456
|
+
'error_message': str(e),
|
1457
|
+
'error_args': e.args if hasattr(e, 'args') else None
|
1458
|
+
})
|
1459
|
+
break
|
1353
1460
|
|
1354
|
-
|
1355
|
-
result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
|
1356
|
-
result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
|
1357
|
-
result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
|
1461
|
+
raise last_exception if last_exception else Exception("发生未知错误")
|
1358
1462
|
|
1359
|
-
|
1360
|
-
int_num = otk.is_integer(v) # 判断整数
|
1361
|
-
count_int, count_float = count_decimal_places(v) # 判断小数,返回小数位数
|
1362
|
-
if result1: # 京东sku/spu商品信息
|
1363
|
-
__res_dict.update({k: 'varchar(100)'})
|
1364
|
-
elif k == '日期':
|
1365
|
-
__res_dict.update({k: 'DATE'})
|
1366
|
-
elif k == '更新时间':
|
1367
|
-
__res_dict.update({k: 'TIMESTAMP'})
|
1368
|
-
elif result2: # 小数
|
1369
|
-
__res_dict.update({k: 'decimal(10,4)'})
|
1370
|
-
elif date_type == 1: # 纯日期
|
1371
|
-
__res_dict.update({k: 'DATE'})
|
1372
|
-
elif date_type == 2: # 日期+时间
|
1373
|
-
__res_dict.update({k: 'DATETIME'})
|
1374
|
-
elif int_num:
|
1375
|
-
__res_dict.update({k: 'INT'})
|
1376
|
-
elif count_float > 0:
|
1377
|
-
if count_int + count_float > 10:
|
1378
|
-
# if count_float > 5:
|
1379
|
-
# v = round(float(v), 4)
|
1380
|
-
if count_float >= 6:
|
1381
|
-
__res_dict.update({k: 'decimal(14,6)'})
|
1382
|
-
else:
|
1383
|
-
__res_dict.update({k: 'decimal(14,4)'})
|
1384
|
-
elif count_float >= 6:
|
1385
|
-
__res_dict.update({k: 'decimal(14,6)'})
|
1386
|
-
elif count_float >= 4:
|
1387
|
-
__res_dict.update({k: 'decimal(12,4)'})
|
1388
|
-
else:
|
1389
|
-
__res_dict.update({k: 'decimal(10,2)'})
|
1390
|
-
else:
|
1391
|
-
__res_dict.update({k: 'varchar(255)'})
|
1392
|
-
new_dict_data.update({k: v})
|
1393
|
-
return __res_dict, new_dict_data
|
1463
|
+
return wrapper
|
1394
1464
|
|
1395
|
-
def
|
1396
|
-
"""
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
|
1402
|
-
|
1403
|
-
|
1404
|
-
result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
|
1405
|
-
result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
|
1465
|
+
def _get_connection(self):
|
1466
|
+
"""从连接池获取连接"""
|
1467
|
+
try:
|
1468
|
+
conn = self.pool.connection()
|
1469
|
+
self._log_with_metrics('debug', "获取数据库连接")
|
1470
|
+
return conn
|
1471
|
+
except Exception as e:
|
1472
|
+
self._log_with_metrics("error", str(e))
|
1473
|
+
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
1406
1474
|
|
1407
|
-
|
1408
|
-
|
1409
|
-
|
1410
|
-
|
1411
|
-
elif result3: # 小数
|
1412
|
-
__res_dict.update({k: 'decimal(12,4)'})
|
1413
|
-
elif result4: # 小数
|
1414
|
-
__res_dict.update({k: 'decimal(12,2)'})
|
1415
|
-
elif k == '日期':
|
1416
|
-
__res_dict.update({k: 'date'})
|
1417
|
-
elif k == '更新时间':
|
1418
|
-
__res_dict.update({k: 'timestamp'})
|
1419
|
-
elif v == 'int64':
|
1420
|
-
__res_dict.update({k: 'int'})
|
1421
|
-
elif v == 'float64':
|
1422
|
-
__res_dict.update({k: 'decimal(10,4)'})
|
1423
|
-
elif v == 'bool':
|
1424
|
-
__res_dict.update({k: 'boolean'})
|
1425
|
-
elif v == 'datetime64[ns]':
|
1426
|
-
__res_dict.update({k: 'datetime'})
|
1427
|
-
else:
|
1428
|
-
__res_dict.update({k: 'varchar(255)'})
|
1429
|
-
return __res_dict, df
|
1475
|
+
def _check_database_exists(self, db_name: str) -> bool:
|
1476
|
+
"""检查数据库是否存在"""
|
1477
|
+
db_name = self._validate_identifier(db_name)
|
1478
|
+
sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
|
1430
1479
|
|
1431
|
-
|
1432
|
-
|
1433
|
-
|
1480
|
+
try:
|
1481
|
+
with self._get_connection() as conn:
|
1482
|
+
with conn.cursor() as cursor:
|
1483
|
+
cursor.execute(sql, (db_name,))
|
1484
|
+
exists = bool(cursor.fetchone())
|
1485
|
+
self._log_with_metrics('debug', f"{db_name} 数据库已存在: {exists}")
|
1486
|
+
return exists
|
1487
|
+
except Exception as e:
|
1488
|
+
self._log_with_metrics('error', f"检查数据库是否存在时出错: {str(e)}")
|
1489
|
+
raise
|
1490
|
+
|
1491
|
+
def _create_database(self, db_name: str):
|
1492
|
+
"""创建数据库"""
|
1493
|
+
db_name = self._validate_identifier(db_name)
|
1494
|
+
sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
|
1495
|
+
|
1496
|
+
try:
|
1497
|
+
with self._get_connection() as conn:
|
1498
|
+
with conn.cursor() as cursor:
|
1499
|
+
cursor.execute(sql)
|
1500
|
+
conn.commit()
|
1501
|
+
self._log_with_metrics('info', f"{db_name} 数据库已创建")
|
1502
|
+
except Exception as e:
|
1503
|
+
self._log_with_metrics('error', f"{db_name}: 无法创建数据库 {str(e)}")
|
1504
|
+
conn.rollback()
|
1505
|
+
raise
|
1506
|
+
|
1507
|
+
def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
|
1434
1508
|
"""
|
1435
|
-
|
1436
|
-
|
1437
|
-
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
filename: 用来追踪处理进度,传这个参数是方便定位产生错误的文件
|
1443
|
-
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
1509
|
+
获取分表名称
|
1510
|
+
|
1511
|
+
:param table_name: 基础表名
|
1512
|
+
:param date_value: 日期值
|
1513
|
+
:param partition_by: 分表方式 ('year' 或 'month')
|
1514
|
+
:return: 分表名称
|
1515
|
+
:raises ValueError: 如果日期格式无效或分表方式无效
|
1444
1516
|
"""
|
1445
|
-
|
1446
|
-
|
1447
|
-
|
1448
|
-
|
1449
|
-
|
1450
|
-
|
1451
|
-
|
1452
|
-
|
1453
|
-
|
1454
|
-
return
|
1517
|
+
try:
|
1518
|
+
date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
|
1519
|
+
except ValueError:
|
1520
|
+
try:
|
1521
|
+
date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
|
1522
|
+
except ValueError:
|
1523
|
+
error_msg = f"无效的日期格式: {date_value}"
|
1524
|
+
self._log_with_metrics('error', error_msg)
|
1525
|
+
raise ValueError(error_msg)
|
1455
1526
|
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
return
|
1527
|
+
if partition_by == 'year':
|
1528
|
+
return f"{table_name}_{date_obj.year}"
|
1529
|
+
elif partition_by == 'month':
|
1530
|
+
return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
|
1461
1531
|
else:
|
1462
|
-
|
1463
|
-
|
1464
|
-
|
1465
|
-
logger.info(f'{db_name} 不能为 None')
|
1466
|
-
return
|
1532
|
+
error_msg = "partition_by must be 'year' or 'month'"
|
1533
|
+
self._log_with_metrics('error', error_msg)
|
1534
|
+
raise ValueError(error_msg)
|
1467
1535
|
|
1468
|
-
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
min_year = df['日期'].min(skipna=True).year
|
1473
|
-
min_month = df['日期'].min(skipna=True).month
|
1474
|
-
if 0 < int(min_month) < 10 and not str(min_month).startswith('0'):
|
1475
|
-
min_month = f'0{min_month}'
|
1476
|
-
if str(cut_data).lower() == 'year':
|
1477
|
-
table_name = f'{table_name}_{min_year}'
|
1478
|
-
elif str(cut_data).lower() == 'month':
|
1479
|
-
table_name = f'{table_name}_{min_year}-{min_month}'
|
1480
|
-
else:
|
1481
|
-
logger.info(f'参数不正确,cut_data应为 year 或 month ')
|
1482
|
-
except Exception as e:
|
1483
|
-
logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
1484
|
-
# 清理 dataframe 非法值,并转换获取数据类型
|
1485
|
-
dtypes, df = self.convert_df_dtypes(df)
|
1486
|
-
if set_typ:
|
1487
|
-
# 更新自定义的列数据类型
|
1488
|
-
for k, v in dtypes.copy().items():
|
1489
|
-
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
1490
|
-
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
1536
|
+
def _validate_identifier(self, identifier: str) -> str:
|
1537
|
+
"""
|
1538
|
+
验证并清理数据库标识符(数据库名、表名、列名)
|
1539
|
+
防止SQL注入和非法字符
|
1491
1540
|
|
1492
|
-
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
1501
|
-
cursor.execute(sql)
|
1502
|
-
connection.commit()
|
1503
|
-
logger.info(f"创建Database: {db_name}")
|
1541
|
+
:param identifier: 要验证的标识符
|
1542
|
+
:return: 清理后的安全标识符
|
1543
|
+
:raises ValueError: 如果标识符无效
|
1544
|
+
"""
|
1545
|
+
if not identifier or not isinstance(identifier, str):
|
1546
|
+
error_msg = f"无效的标识符: {identifier}"
|
1547
|
+
self._log_with_metrics('error', error_msg)
|
1548
|
+
raise ValueError(error_msg)
|
1504
1549
|
|
1505
|
-
|
1506
|
-
|
1507
|
-
if not
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
1512
|
-
cursor.execute(sql, (table_name,))
|
1513
|
-
if not cursor.fetchone():
|
1514
|
-
create_table_sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY)"
|
1515
|
-
cursor.execute(create_table_sql)
|
1516
|
-
logger.info(f'创建 mysql 表: {table_name}')
|
1550
|
+
# 移除非法字符,只保留字母、数字、下划线和美元符号
|
1551
|
+
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
|
1552
|
+
if not cleaned:
|
1553
|
+
error_msg = f"无法清理异常标识符: {identifier}"
|
1554
|
+
self._log_with_metrics('error', error_msg)
|
1555
|
+
raise ValueError(error_msg)
|
1517
1556
|
|
1518
|
-
|
1519
|
-
|
1520
|
-
|
1521
|
-
|
1522
|
-
|
1523
|
-
|
1557
|
+
# 检查是否为MySQL保留字
|
1558
|
+
mysql_keywords = {
|
1559
|
+
'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
|
1560
|
+
'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
|
1561
|
+
}
|
1562
|
+
if cleaned.lower() in mysql_keywords:
|
1563
|
+
self._log_with_metrics('debug', f"存在MySQL保留字: {cleaned}")
|
1564
|
+
return f"`{cleaned}`"
|
1524
1565
|
|
1525
|
-
|
1526
|
-
if col_not_exist: # 数据表中不存在的列
|
1527
|
-
for col in col_not_exist:
|
1528
|
-
# 创建列,需转义
|
1529
|
-
alter_sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]}"
|
1530
|
-
if not allow_not_null:
|
1531
|
-
alter_sql += " NOT NULL"
|
1532
|
-
cursor.execute(alter_sql)
|
1533
|
-
logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
1566
|
+
return cleaned
|
1534
1567
|
|
1535
|
-
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1568
|
+
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
    """
    Check whether a table exists, by querying INFORMATION_SCHEMA.TABLES
    for the (schema, table) pair. Re-raises any database error after logging.
    """
    db_name = self._validate_identifier(db_name)
    table_name = self._validate_identifier(table_name)
    sql = """
    SELECT TABLE_NAME
    FROM INFORMATION_SCHEMA.TABLES
    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
    """
    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (db_name, table_name))
                # A single matching row means the table exists.
                return bool(cursor.fetchone())
    except Exception as e:
        self._log_with_metrics('error', f"检查数据表是否存在时发生未知错误: {e}", )
        raise
|
1559
1587
|
|
1560
|
-
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1588
|
+
def _create_table(
    self,
    db_name: str,
    table_name: str,
    set_typ: Dict[str, str],
    primary_keys: Optional[List[str]] = None,
    date_column: Optional[str] = None,
    indexes: Optional[List[str]] = None,
    allow_null: bool = False
):
    """
    创建数据表

    Builds and executes a CREATE TABLE IF NOT EXISTS statement with a
    surrogate auto-increment `id` column, then adds secondary indexes.

    :param db_name: 数据库名
    :param table_name: 表名
    :param set_typ: 列名和数据类型字典 {列名: 数据类型}
    :param primary_keys: 主键列列表 ('id' is always included)
    :param date_column: 日期列名,如果存在将设置为索引
    :param indexes: 需要创建索引的列列表
    :param allow_null: when False, non-JSON columns get NOT NULL
    :raises ValueError: if set_typ is empty
    """
    db_name = self._validate_identifier(db_name)
    table_name = self._validate_identifier(table_name)

    if not set_typ:
        error_msg = "No columns specified for table creation"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    # Surrogate auto-increment key is always present.
    column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]

    for col_name, col_type in set_typ.items():
        # 'id' is managed internally; skip any caller-supplied definition.
        if col_name.lower() == 'id':
            continue
        safe_col_name = self._validate_identifier(col_name)
        col_def = f"`{safe_col_name}` {col_type}"
        # JSON columns cannot take NOT NULL with a default in MySQL; leave nullable.
        if not allow_null and not col_type.lower().startswith('json'):
            col_def += " NOT NULL"
        column_defs.append(col_def)

    # Ensure 'id' participates in the primary key.
    if primary_keys:
        if 'id' not in [pk.lower() for pk in primary_keys]:
            primary_keys = ['id'] + primary_keys
    else:
        primary_keys = ['id']
    safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
    primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"

    sql = f"""
    CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
        {','.join(column_defs)}
        {primary_key_sql}
    ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
    """

    try:
        with self._get_connection() as conn:
            # BUGFIX: rollback used to live in the outer except clause, where
            # `conn` is unbound if _get_connection() itself raised, turning the
            # original error into a NameError. Roll back only once conn exists.
            try:
                with conn.cursor() as cursor:
                    cursor.execute(sql)
                    self._log_with_metrics('info', f"{db_name}.{table_name}: 数据表已创建")

                # Collect index DDL, then run it in one cursor.
                index_statements = []

                if date_column and date_column in set_typ:
                    safe_date_col = self._validate_identifier(date_column)
                    index_statements.append(
                        f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
                    )

                if indexes:
                    for idx_col in indexes:
                        if idx_col in set_typ:
                            safe_idx_col = self._validate_identifier(idx_col)
                            index_statements.append(
                                f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
                            )

                if index_statements:
                    with conn.cursor() as cursor:
                        for stmt in index_statements:
                            cursor.execute(stmt)
                            self._log_with_metrics('debug', f"Executed index statement: {stmt}", )

                conn.commit()
                self._log_with_metrics('info', f"{db_name}.{table_name}: 索引已添加")
            except Exception:
                conn.rollback()
                raise
    except Exception as e:
        self._log_with_metrics('error', f"{db_name}.{table_name}: 建表失败: {str(e)}")
        raise
|
1698
1693
|
|
1699
|
-
|
1700
|
-
|
1694
|
+
def _validate_datetime(self, value):
|
1695
|
+
formats = [
|
1696
|
+
'%Y-%m-%d %H:%M:%S',
|
1697
|
+
'%Y-%m-%d',
|
1698
|
+
'%Y/%m/%d %H:%M:%S',
|
1699
|
+
'%Y/%m/%d',
|
1700
|
+
'%Y%m%d',
|
1701
|
+
'%Y-%m-%dT%H:%M:%S',
|
1702
|
+
'%Y-%m-%d %H:%M:%S.%f'
|
1703
|
+
]
|
1704
|
+
for fmt in formats:
|
1701
1705
|
try:
|
1702
|
-
return
|
1703
|
-
except
|
1704
|
-
|
1705
|
-
|
1706
|
-
return wrapper
|
1706
|
+
return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
|
1707
|
+
except ValueError:
|
1708
|
+
continue
|
1709
|
+
raise ValueError(f"无效的日期格式: {value}")
|
1707
1710
|
|
1708
|
-
def
|
1709
|
-
|
1710
|
-
|
1711
|
-
try:
|
1712
|
-
connection = pymysql.connect(**_config) # 连接数据库
|
1713
|
-
return connection
|
1714
|
-
except Exception as e:
|
1715
|
-
logger.error(f'{_db_name}连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
|
1716
|
-
attempts += 1
|
1717
|
-
time.sleep(30)
|
1718
|
-
logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
|
1719
|
-
return None
|
1711
|
+
def _validate_value(self, value: Any, column_type: str) -> Any:
|
1712
|
+
"""
|
1713
|
+
验证并清理数据值,根据列类型进行适当转换
|
1720
1714
|
|
1721
|
-
|
1715
|
+
:param value: 要验证的值
|
1716
|
+
:param column_type: 列的数据类型
|
1717
|
+
:return: 清理后的值
|
1718
|
+
:raises ValueError: 如果值转换失败
|
1722
1719
|
"""
|
1723
|
-
|
1724
|
-
|
1720
|
+
if value is None:
|
1721
|
+
return None
|
1722
|
+
|
1723
|
+
try:
|
1724
|
+
column_type_lower = column_type.lower()
|
1725
|
+
|
1726
|
+
if 'int' in column_type_lower:
|
1727
|
+
return int(value) if value is not None else None
|
1728
|
+
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
1729
|
+
return float(value) if value is not None else None
|
1730
|
+
elif '日期' in column_type_lower or 'time' in column_type_lower:
|
1731
|
+
if isinstance(value, (datetime.datetime, pd.Timestamp)):
|
1732
|
+
return value.strftime('%Y-%m-%d %H:%M:%S')
|
1733
|
+
elif isinstance(value, str):
|
1734
|
+
try:
|
1735
|
+
return self._validate_datetime(value) # 使用专门的日期验证方法
|
1736
|
+
except ValueError as e:
|
1737
|
+
raise ValueError(f"无效日期格式: {value} - {str(e)}")
|
1738
|
+
return str(value)
|
1739
|
+
elif 'char' in column_type_lower or 'text' in column_type_lower:
|
1740
|
+
# 防止SQL注入
|
1741
|
+
if isinstance(value, str):
|
1742
|
+
return value.replace('\\', '\\\\').replace("'", "\\'")
|
1743
|
+
return str(value)
|
1744
|
+
elif 'json' in column_type_lower:
|
1745
|
+
import json
|
1746
|
+
return json.dumps(value) if value is not None else None
|
1747
|
+
else:
|
1748
|
+
return value
|
1749
|
+
except (ValueError, TypeError) as e:
|
1750
|
+
error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
|
1751
|
+
self._log_with_metrics('error', error_msg)
|
1752
|
+
raise ValueError(error_msg)
|
1753
|
+
|
1754
|
+
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
    """
    获取表的列名和数据类型

    Reads INFORMATION_SCHEMA.COLUMNS in ordinal order and returns a
    {column_name: data_type} mapping. Errors are logged and re-raised.
    """
    db_name = self._validate_identifier(db_name)
    table_name = self._validate_identifier(table_name)
    sql = """
    SELECT COLUMN_NAME, DATA_TYPE
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
    ORDER BY ORDINAL_POSITION
    """
    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (db_name, table_name))
                rows = cursor.fetchall()
                # DictCursor rows expose column values by name.
                set_typ = {entry['COLUMN_NAME']: entry['DATA_TYPE'] for entry in rows}
                self._log_with_metrics('debug', f"{db_name}.{table_name}: 获取表的列信息: {set_typ}")
                return set_typ
    except Exception as e:
        self._log_with_metrics('error', f"无法获取表列信息: {str(e)}")
        raise
|
1742
1775
|
|
1743
|
-
|
1744
|
-
|
1745
|
-
|
1746
|
-
|
1747
|
-
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1776
|
+
def _upload_to_table(
    self,
    db_name: str,
    table_name: str,
    data: List[Dict],
    set_typ: Dict[str, str],
    primary_keys: Optional[List[str]],
    check_duplicate: bool,
    duplicate_columns: Optional[List[str]],
    allow_null: bool,
    auto_create: bool,
    date_column: Optional[str],
    indexes: Optional[List[str]],
    batch_id: Optional[str] = None
):
    """
    实际执行表上传的方法

    Ensures the target table exists (creating it when auto_create is set),
    validates that every column in set_typ is present on the table, then
    delegates the row insertion to _insert_data.
    """
    # Create the table on demand, or fail fast when auto-creation is disabled.
    if not self._check_table_exists(db_name, table_name):
        if not auto_create:
            error_msg = f"数据表不存在: '{db_name}.{table_name}'"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)
        self._create_table(db_name, table_name, set_typ, primary_keys,
                           date_column, indexes, allow_null=allow_null)

    # The table must expose at least one column.
    table_columns = self._get_table_columns(db_name, table_name)
    if not table_columns:
        error_msg = f"获取列失败 '{db_name}.{table_name}'"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    # Every declared column must exist on the live table.
    for col in set_typ:
        if col not in table_columns:
            error_msg = f"列不存在: '{col}' -> '{db_name}.{table_name}'"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)

    # NOTE(review): batch_id is accepted but not forwarded to _insert_data —
    # confirm whether it was meant to be passed through.
    self._insert_data(
        db_name, table_name, data, set_typ,
        check_duplicate, duplicate_columns
    )
|
1821
|
+
|
1822
|
+
def _prepare_data(
    self,
    data: Union[Dict, List[Dict], pd.DataFrame],
    set_typ: Dict[str, str],
    allow_null: bool = False
) -> List[Dict]:
    """
    准备要上传的数据,验证并转换数据类型

    Normalizes the input (DataFrame / dict / list of dicts) into a list of
    dicts and runs every value through _validate_value.

    :param data: 输入数据
    :param set_typ: 列名和数据类型字典 {列名: 数据类型}
    :param allow_null: 是否允许空值 (missing columns become None when True)
    :return: 准备好的数据列表
    :raises ValueError: 如果数据验证失败
    """
    # Normalize every accepted input shape to a list of dicts.
    if isinstance(data, pd.DataFrame):
        try:
            data = data.replace({pd.NA: None}).to_dict('records')
        except Exception as e:
            self._log_with_metrics("error", f"转为为字典时发生错误: {e}", )
            raise ValueError(f"转为为字典时发生错误: {e}")
    elif isinstance(data, dict):
        data = [data]
    elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
        error_msg = "Data must be a dict, list of dicts, or DataFrame"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    prepared_data = []
    for row_idx, source_row in enumerate(data, 1):
        converted = {}
        for col_name, col_type in set_typ.items():
            # 'id' is auto-generated by the database; never accept it from input.
            if col_name.lower() == 'id':
                continue

            if col_name not in source_row:
                if not allow_null:
                    error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
                    self._log_with_metrics('error', error_msg)
                    raise ValueError(error_msg)
                converted[col_name] = None
                continue

            try:
                converted[col_name] = self._validate_value(source_row[col_name], col_type)
            except ValueError as e:
                error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
                self._log_with_metrics('error', error_msg)
                raise ValueError(error_msg)
        prepared_data.append(converted)

    self._log_with_metrics('debug', f"已准备 {len(prepared_data)} 行数据")
    return prepared_data
|
1861
1876
|
|
1862
|
-
|
1863
|
-
|
1877
|
+
def upload_data(
    self,
    db_name: str,
    table_name: str,
    data: Union[Dict, List[Dict], pd.DataFrame],
    set_typ: Dict[str, str],
    primary_keys: Optional[List[str]] = None,
    check_duplicate: bool = False,
    duplicate_columns: Optional[List[str]] = None,
    allow_null: bool = False,
    partition_by: Optional[str] = None,
    partition_date_column: str = '日期',
    auto_create: bool = True,
    indexes: Optional[List[str]] = None
):
    """
    上传数据到数据库

    Validates parameters, prepares rows, optionally routes them into
    year/month partition tables, and records metrics for the whole run.
    Errors are logged rather than propagated; success is reflected in the
    recorded metrics.
    """
    upload_start = time.time()
    self._record_metrics('total_uploads', 1)
    # Hoisted: the same length expression was computed twice in-line.
    row_count = len(data) if hasattr(data, '__len__') else 1
    initial_row_count = row_count
    self.metrics['total_rows'] += row_count

    batch_id = f"batch_{int(time.time() * 1000)}"
    success_flag = False

    self._log_with_metrics('info', "开始上传数据", {
        'batch_id': batch_id,
        'database': db_name,
        'table': table_name,
        'partition_by': partition_by,
        'check_duplicate': check_duplicate,
        'row_count': row_count,
        'auto_create': auto_create
    })

    try:
        # Parameter validation.
        if not set_typ:
            error_msg = "必须指定列定义"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)
        if partition_by and partition_by not in ['year', 'month']:
            error_msg = "分表方式必须是 'year' 或 'month'"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)

        prepared_data = self._prepare_data(data, set_typ, allow_null)

        # Make sure the target database exists (or create it).
        if not self._check_database_exists(db_name):
            if not auto_create:
                error_msg = f"数据库不存在: '{db_name}'"
                self._log_with_metrics('error', error_msg)
                raise ValueError(error_msg)
            self._create_database(db_name)

        if partition_by:
            # Bucket each row into its partition table; bad rows are
            # logged and skipped, they never abort the whole upload.
            partitioned_data = {}
            for row in prepared_data:
                try:
                    if partition_date_column not in row:
                        error_msg = f"异常缺失列 '{partition_date_column}'"
                        self._log_with_metrics('error', error_msg)
                        continue
                    part_table = self._get_partition_table_name(
                        table_name,
                        str(row[partition_date_column]),
                        partition_by
                    )
                    partitioned_data.setdefault(part_table, []).append(row)
                except Exception as e:
                    self._log_with_metrics('error', "分表处理失败", {
                        'row_data': row,
                        'error': str(e)
                    })
                    continue

            # Upload each partition independently; a failed partition does
            # not stop the remaining ones.
            for part_table, part_rows in partitioned_data.items():
                try:
                    self._upload_to_table(
                        db_name, part_table, part_rows, set_typ,
                        primary_keys, check_duplicate, duplicate_columns,
                        allow_null, auto_create, partition_date_column,
                        indexes, batch_id
                    )
                except Exception as e:
                    self._log_with_metrics('error', "分表上传失败", {
                        'partition_table': part_table,
                        'error': str(e)
                    })
                    continue
        else:
            self._upload_to_table(
                db_name, table_name, prepared_data, set_typ,
                primary_keys, check_duplicate, duplicate_columns,
                allow_null, auto_create, partition_date_column,
                indexes, batch_id
            )

        success_flag = True

    except Exception as e:
        self._log_with_metrics('error', "上传过程中发生全局错误", {
            'error': str(e),
            'error_type': type(e).__name__
        })
    finally:
        elapsed = time.time() - upload_start
        self._record_metrics('upload_execution_time', elapsed, is_timing=True)

        if success_flag:
            self._record_metrics('successful_uploads', 1)
        else:
            self._record_metrics('failed_uploads', 1)

        self._log_with_metrics('info', "上传处理完成", {
            'batch_id': batch_id,
            'success': success_flag,
            'time_elapsed': elapsed,
            'initial_row_count': initial_row_count,
            'processed_rows': self.metrics['successful_rows'] + self.metrics['failed_rows']
        })
|
2009
|
+
|
2010
|
+
def _insert_data(
    self,
    db_name: str,
    table_name: str,
    data: List[Dict],
    set_typ: Dict[str, str],
    check_duplicate: bool = False,
    duplicate_columns: Optional[List[str]] = None,
    batch_size: int = 1000,
    batch_id: Optional[str] = None
):
    """
    插入数据到表中,增强日志记录和性能监控

    Rows are inserted one at a time (committed individually) in batches of
    ``batch_size`` for metrics/logging purposes. With check_duplicate, an
    INSERT ... SELECT ... WHERE NOT EXISTS statement skips rows whose
    duplicate_columns already match an existing row.
    """
    if not data:
        return

    # Column list excludes the auto-increment 'id'.
    all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
    safe_columns = [self._validate_identifier(col) for col in all_columns]
    placeholders = ','.join(['%s'] * len(safe_columns))

    if check_duplicate:
        # Default duplicate key is the full column set.
        if not duplicate_columns:
            duplicate_columns = all_columns

        safe_dup_columns = [self._validate_identifier(col) for col in duplicate_columns]
        where_clause = " AND ".join(f"`{col}` = %s" for col in safe_dup_columns)

        sql = f"""
        INSERT INTO `{db_name}`.`{table_name}`
        (`{'`,`'.join(safe_columns)}`)
        SELECT {placeholders}
        FROM DUAL
        WHERE NOT EXISTS (
            SELECT 1 FROM `{db_name}`.`{table_name}`
            WHERE {where_clause}
        )
        """
    else:
        sql = f"""
        INSERT INTO `{db_name}`.`{table_name}`
        (`{'`,`'.join(safe_columns)}`)
        VALUES ({placeholders})
        """

    total_inserted = 0
    total_skipped = 0
    total_failed = 0

    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            for offset in range(0, len(data), batch_size):
                batch_start = time.time()
                batch = data[offset:offset + batch_size]
                successful_rows = 0

                for row in batch:
                    try:
                        row_values = [row.get(col) for col in all_columns]
                        if check_duplicate:
                            # WHERE NOT EXISTS clause consumes the dup values.
                            row_values += [row.get(col) for col in duplicate_columns]

                        cursor.execute(sql, row_values)
                        successful_rows += 1
                        conn.commit()  # per-row commit, matching original behavior

                    except Exception as e:
                        conn.rollback()
                        total_failed += 1
                        self._log_with_metrics('error', "单行插入失败", {
                            'batch_id': batch_id,
                            'database': db_name,
                            'table': table_name,
                            'row_data': row,
                            'error_type': type(e).__name__,
                            'error_message': str(e)
                        })
                        continue

                if check_duplicate:
                    # NOTE(review): executing "SELECT ROW_COUNT()" and then
                    # reading cursor.rowcount reports the SELECT's rowcount,
                    # not the insert's — the inserted/skipped split below is
                    # therefore suspect; confirm against PyMySQL semantics.
                    cursor.execute("SELECT ROW_COUNT()")
                    affected_rows = cursor.rowcount
                    total_inserted += affected_rows
                    # Algebraically identical to the original
                    # len(batch) - affected - (len(batch) - successful).
                    total_skipped += successful_rows - affected_rows
                else:
                    total_inserted += successful_rows

                batch_elapsed = time.time() - batch_start
                self._record_metrics('batch_execution_time', batch_elapsed, is_timing=True)

                self._log_with_metrics('debug', "批次处理完成", {
                    'batch_id': batch_id,
                    'batch_index': offset // batch_size + 1,
                    'total_batches': (len(data) + batch_size - 1) // batch_size,
                    'batch_size': len(batch),
                    'successful_rows': successful_rows,
                    'failed_rows': len(batch) - successful_rows,
                    'time_elapsed': batch_elapsed,
                    'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
                })

    self.metrics['failed_rows'] += total_failed
    self._log_with_metrics('info', "数据插入完成", {
        'total_rows': len(data),
        'inserted_rows': total_inserted,
        'skipped_rows': total_skipped,
        'failed_rows': total_failed
    })
|
2128
|
+
|
2129
|
+
def get_metrics(self) -> Dict:
|
2130
|
+
"""获取当前性能指标"""
|
2131
|
+
metrics = self.metrics.copy()
|
2132
|
+
|
2133
|
+
# 添加当前系统指标
|
2134
|
+
metrics.update({
|
2135
|
+
'current_time': datetime.datetime.now().isoformat(),
|
2136
|
+
'system': self._get_system_metrics(),
|
2137
|
+
'connection_pool': {
|
2138
|
+
'size': self.pool_size,
|
2139
|
+
'active': len(self.pool._connections) if hasattr(self.pool, '_connections') else 0
|
2140
|
+
}
|
2141
|
+
})
|
2142
|
+
|
2143
|
+
return metrics
|
2144
|
+
|
2145
|
+
def close(self):
|
2146
|
+
"""关闭连接池并记录最终指标"""
|
2147
|
+
close_start = time.time()
|
2148
|
+
|
1925
2149
|
try:
|
1926
|
-
|
1927
|
-
|
1928
|
-
|
1929
|
-
|
2150
|
+
if hasattr(self, 'pool') and self.pool is not None:
|
2151
|
+
# 记录关闭前的连接池状态
|
2152
|
+
active_connections = self._get_system_metrics().get('connections', 0)
|
2153
|
+
|
2154
|
+
# 更安全的关闭方式
|
2155
|
+
try:
|
2156
|
+
self.pool.close()
|
2157
|
+
except Exception as e:
|
2158
|
+
self._log_with_metrics('warning', "关闭连接池时出错", {
|
2159
|
+
'error': str(e)
|
2160
|
+
})
|
2161
|
+
|
2162
|
+
self.pool = None
|
2163
|
+
|
2164
|
+
elapsed = time.time() - close_start
|
2165
|
+
self._log_with_metrics('info', "连接池已关闭", {
|
2166
|
+
'active_connections_before_close': active_connections,
|
2167
|
+
'close_time_elapsed': elapsed
|
2168
|
+
})
|
1930
2169
|
except Exception as e:
|
1931
|
-
|
2170
|
+
elapsed = time.time() - close_start
|
2171
|
+
self._log_with_metrics('error', "关闭连接池失败", {
|
2172
|
+
'error': str(e),
|
2173
|
+
'close_time_elapsed': elapsed
|
2174
|
+
})
|
2175
|
+
raise
|
1932
2176
|
finally:
|
1933
|
-
|
1934
|
-
|
1935
|
-
|
1936
|
-
def day_list(self, start_date, end_date):
|
1937
|
-
start_date = pd.to_datetime(start_date)
|
1938
|
-
end_date = pd.to_datetime(end_date)
|
1939
|
-
date_list = []
|
1940
|
-
while start_date <= end_date:
|
1941
|
-
date_list.append(pd.to_datetime(start_date.date()))
|
1942
|
-
start_date += datetime.timedelta(days=1)
|
1943
|
-
return date_list
|
2177
|
+
# 记录最终性能指标
|
2178
|
+
if hasattr(self, 'logger') and self.logger and self.enable_metrics:
|
2179
|
+
self._log_with_metrics('debug', "最终性能指标", self.get_metrics())
|
1944
2180
|
|
1945
|
-
|
1946
|
-
|
1947
|
-
"""
|
1948
|
-
# for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2',]:
|
1949
|
-
# s = OptimizeDatas(username=username, password=password, host=host, port=port)
|
1950
|
-
# s.db_name = db_name
|
1951
|
-
# s.rename_column()
|
1952
|
-
"""
|
1953
|
-
tables = self.table_list(db_name=self.db_name)
|
1954
|
-
for table_dict in tables:
|
1955
|
-
for key, table_name in table_dict.items():
|
1956
|
-
self.config.update({'database': self.db_name}) # 添加更新 config 字段
|
1957
|
-
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
|
1958
|
-
if not self.connection:
|
1959
|
-
return
|
1960
|
-
with self.connection.cursor() as cursor:
|
1961
|
-
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
|
1962
|
-
columns = cursor.fetchall()
|
1963
|
-
columns = [{column['Field']: column['Type']} for column in columns]
|
1964
|
-
for column in columns:
|
1965
|
-
for key, value in column.items():
|
1966
|
-
if key.endswith('_'):
|
1967
|
-
new_name = re.sub(r'_+$', '', key)
|
1968
|
-
sql = f"ALTER TABLE `{table_name}` CHANGE COLUMN {key} {new_name} {value}"
|
1969
|
-
cursor.execute(sql)
|
1970
|
-
self.connection.commit()
|
1971
|
-
if self.connection:
|
1972
|
-
self.connection.close()
|
2181
|
+
def __main__():
|
2182
|
+
pass
|
1973
2183
|
|
1974
2184
|
|
1975
2185
|
if __name__ == '__main__':
|
@@ -1981,12 +2191,12 @@ if __name__ == '__main__':
|
|
1981
2191
|
password='1',
|
1982
2192
|
host='localhost',
|
1983
2193
|
port=3306,
|
1984
|
-
|
1985
|
-
log_level='
|
2194
|
+
logging_mode='both',
|
2195
|
+
log_level='info'
|
1986
2196
|
)
|
1987
2197
|
|
1988
2198
|
# 定义列和数据类型
|
1989
|
-
|
2199
|
+
set_typ = {
|
1990
2200
|
'id': 'INT',
|
1991
2201
|
'name': 'VARCHAR(255)',
|
1992
2202
|
'age': 'INT',
|
@@ -1996,23 +2206,25 @@ if __name__ == '__main__':
|
|
1996
2206
|
|
1997
2207
|
# 准备数据
|
1998
2208
|
data = [
|
1999
|
-
{'
|
2000
|
-
{'
|
2001
|
-
{'
|
2209
|
+
{'日期': '2023-01-15', 'name': 'Alice', 'age': 35, 'salary': 100},
|
2210
|
+
{'日期': '2023-01-15', 'name': 'Alice', 'age': 30, 'salary': 0.0},
|
2211
|
+
{'日期': '2023-02-20', 'name': 'Bob', 'age': 25, 'salary': 45000.75}
|
2002
2212
|
]
|
2003
2213
|
|
2004
2214
|
# 上传数据
|
2005
2215
|
uploader.upload_data(
|
2006
|
-
db_name='
|
2007
|
-
table_name='
|
2216
|
+
db_name='测试库',
|
2217
|
+
table_name='测试表',
|
2008
2218
|
data=data,
|
2009
|
-
|
2010
|
-
primary_keys=[],
|
2011
|
-
check_duplicate=True,
|
2012
|
-
|
2013
|
-
|
2014
|
-
|
2015
|
-
|
2219
|
+
set_typ=set_typ, # 定义列和数据类型
|
2220
|
+
primary_keys=[], # 指定主键
|
2221
|
+
check_duplicate=True, # 检查重复数据
|
2222
|
+
duplicate_columns=['name', 'age'], #
|
2223
|
+
allow_null=False, # 允许插入空值
|
2224
|
+
partition_by='year', # 按月分表
|
2225
|
+
partition_date_column = '日期', # 用于分表的日期列名,默认为'日期'
|
2226
|
+
auto_create = True, # 表不存在时自动创建, 默认参数不要更改
|
2227
|
+
indexes = ['name'], # 指定索引列
|
2016
2228
|
)
|
2017
2229
|
|
2018
2230
|
# 关闭上传器
|