mdbq 3.9.1__py3-none-any.whl → 3.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/mysql/mysql.py CHANGED
@@ -10,9 +10,15 @@ import pandas as pd
10
10
  from sqlalchemy import create_engine
11
11
  import os
12
12
  import logging
13
+ import logging.handlers
13
14
  from mdbq.other import otk
14
- from dbutils.pooled_db import PooledDB
15
15
  from typing import Union, List, Dict, Optional, Any, Tuple
16
+ from dbutils.pooled_db import PooledDB
17
+ import json
18
+ import psutil
19
+ from collections import OrderedDict
20
+
21
+
16
22
  warnings.filterwarnings('ignore')
17
23
  """
18
24
  建表流程:
@@ -44,933 +50,451 @@ def count_decimal_places(num_str):
44
50
  return 0, 0
45
51
 
46
52
 
47
- class MySQLUploader:
48
- def __init__(
49
- self,
50
- username: str,
51
- password: str,
52
- host: str = 'localhost',
53
- port: int = 3306,
54
- charset: str = 'utf8mb4',
55
- collation: str = 'utf8mb4_0900_ai_ci',
56
- enable_logging: bool = False,
57
- log_level: str = 'ERROR',
58
- max_retries: int = 10,
59
- retry_interval: int = 10,
60
- pool_size: int = 5,
61
- connect_timeout: int = 10,
62
- read_timeout: int = 30,
63
- write_timeout: int = 30,
64
- ssl: Optional[Dict] = None
65
- ):
66
- """
67
- 初始化MySQL上传工具
68
-
69
- :param username: 数据库用户名
70
- :param password: 数据库密码
71
- :param host: 数据库主机地址,默认为localhost
72
- :param port: 数据库端口,默认为3306
73
- :param charset: 字符集,默认为utf8mb4
74
- :param collation: 排序规则,默认为utf8mb4_0900_ai_ci
75
- :param enable_logging: 是否启用日志,默认为False
76
- :param log_level: 日志级别,默认为ERROR
77
- :param max_retries: 最大重试次数,默认为10
78
- :param retry_interval: 重试间隔(秒),默认为10
79
- :param pool_size: 连接池大小,默认为5
80
- :param connect_timeout: 连接超时(秒),默认为10
81
- :param read_timeout: 读取超时(秒),默认为30
82
- :param write_timeout: 写入超时(秒),默认为30
83
- :param ssl: SSL配置字典,默认为None
84
- """
53
+ class MysqlUpload:
54
+ def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
85
55
  self.username = username
86
56
  self.password = password
87
57
  self.host = host
88
58
  self.port = port
89
- self.charset = charset
90
- self.collation = collation
91
- self.max_retries = max(max_retries, 1) # 至少重试1次
92
- self.retry_interval = max(retry_interval, 1) # 至少间隔1秒
93
- self.pool_size = max(pool_size, 1) # 至少1个连接
94
- self.connect_timeout = connect_timeout
95
- self.read_timeout = read_timeout
96
- self.write_timeout = write_timeout
97
- self.ssl = ssl
98
- self._prepared_statements = {} # 预处理语句缓存
99
- self._max_cached_statements = 100 # 最大缓存语句数
100
-
101
- # 初始化日志
102
- if enable_logging:
103
- self._init_logging(log_level)
59
+ if username == '' or password == '' or host == '' or port == 0:
60
+ self.config = None
104
61
  else:
105
- self.logger = None
106
-
107
- # 创建连接池
108
- self.pool = self._create_connection_pool()
109
-
110
- def _init_logging(self, log_level: str):
111
- """初始化日志配置"""
112
- valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
113
- level = log_level.upper() if log_level.upper() in valid_levels else 'ERROR'
114
-
115
- logging.basicConfig(
116
- level=getattr(logging, level),
117
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
118
- handlers=[logging.StreamHandler()]
119
- )
120
- self.logger = logging.getLogger('MySQLUploader')
121
-
122
- def _create_connection_pool(self) -> PooledDB:
123
- """创建数据库连接池"""
124
- pool_params = {
125
- 'creator': pymysql,
126
- 'host': self.host,
127
- 'port': self.port,
128
- 'user': self.username,
129
- 'password': self.password,
130
- 'charset': self.charset,
131
- 'cursorclass': pymysql.cursors.DictCursor,
132
- 'maxconnections': self.pool_size,
133
- 'ping': 7, # 连接检查
134
- 'connect_timeout': self.connect_timeout,
135
- 'read_timeout': self.read_timeout,
136
- 'write_timeout': self.write_timeout,
137
- 'autocommit': False
138
- }
139
-
140
- if self.ssl:
141
- required_keys = {'ca', 'cert', 'key'}
142
- if not all(k in self.ssl for k in required_keys):
143
- raise ValueError("SSL配置必须包含ca、cert和key")
144
- pool_params['ssl'] = {
145
- 'ca': self.ssl['ca'],
146
- 'cert': self.ssl['cert'],
147
- 'key': self.ssl['key'],
148
- 'check_hostname': self.ssl.get('check_hostname', False)
62
+ self.config = {
63
+ 'host': self.host,
64
+ 'port': int(self.port),
65
+ 'user': self.username,
66
+ 'password': self.password,
67
+ 'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
68
+ 'cursorclass': pymysql.cursors.DictCursor,
149
69
  }
70
+ self.filename = None
150
71
 
151
- try:
152
- pool = PooledDB(**pool_params)
153
- return pool
154
- except Exception as e:
155
- if self.logger:
156
- self.logger.error("连接池创建失败: %s", str(e))
157
- raise ConnectionError(f"连接池创建失败: {str(e)}")
72
+ @staticmethod
73
+ def try_except(func): # 在类内部定义一个异常处理方法
158
74
 
159
- def _validate_datetime(self, value):
160
- formats = [
161
- '%Y-%m-%d %H:%M:%S',
162
- '%Y-%m-%d',
163
- '%Y/%m/%d %H:%M:%S',
164
- '%Y/%m/%d',
165
- '%Y%m%d',
166
- '%Y-%m-%dT%H:%M:%S', # ISO格式
167
- '%Y-%m-%d %H:%M:%S.%f' # 带毫秒
168
- ]
169
- for fmt in formats:
75
+ @wraps(func)
76
+ def wrapper(*args, **kwargs):
170
77
  try:
171
- return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
172
- except ValueError:
173
- continue
174
- raise ValueError(f"无效的日期格式: {value}")
175
-
176
- def _validate_identifier(self, identifier: str) -> str:
177
- """
178
- 验证并清理数据库标识符(数据库名、表名、列名)
179
- 防止SQL注入和非法字符
180
-
181
- :param identifier: 要验证的标识符
182
- :return: 清理后的安全标识符
183
- :raises ValueError: 如果标识符无效
184
- """
185
- if not identifier or not isinstance(identifier, str):
186
- error_msg = f"无效的标识符: {identifier}"
187
- if self.logger:
188
- self.logger.error(error_msg)
189
- raise ValueError(error_msg)
190
-
191
- # 移除可能有害的字符,只保留字母、数字、下划线和美元符号
192
- cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
193
- if not cleaned:
194
- error_msg = f"无法清理异常标识符: {identifier}"
195
- if self.logger:
196
- self.logger.error(error_msg)
197
- raise ValueError(error_msg)
198
-
199
- # 检查是否为MySQL保留字
200
- mysql_keywords = {
201
- 'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
202
- 'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
203
- }
204
- if cleaned.lower() in mysql_keywords:
205
- if self.logger:
206
- self.logger.warning("存在MySQL保留字: %s", cleaned)
207
- return f"`{cleaned}`"
208
-
209
- return cleaned
78
+ return func(*args, **kwargs)
79
+ except Exception as e:
80
+ logger.error(f'{func.__name__}, {e}') # 将异常信息返回
210
81
 
211
- def _validate_value(self, value: Any, column_type: str) -> Any:
212
- """
213
- 验证并清理数据值,根据列类型进行适当转换
82
+ return wrapper
214
83
 
215
- :param value: 要验证的值
216
- :param column_type: 列的数据类型
217
- :return: 清理后的值
218
- :raises ValueError: 如果值转换失败
219
- """
220
- if value is None:
221
- return None
84
+ def keep_connect(self, _db_name, _config, max_try: int=10):
85
+ attempts = 1
86
+ while attempts <= max_try:
87
+ try:
88
+ connection = pymysql.connect(**_config) # 连接数据库
89
+ return connection
90
+ except Exception as e:
91
+ logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
92
+ attempts += 1
93
+ time.sleep(30)
94
+ logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
95
+ return None
222
96
 
223
- try:
224
- column_type_lower = column_type.lower()
97
+ def cover_doc_dtypes(self, dict_data):
98
+ """ 清理字典键值 并转换数据类型 """
99
+ if not dict_data:
100
+ logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
101
+ return
102
+ __res_dict = {}
103
+ new_dict_data = {}
104
+ for k, v in dict_data.items():
105
+ k = str(k).lower()
106
+ k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
107
+ k = k.replace(')', '')
108
+ k = re.sub(r'_{2,}', '_', k)
109
+ k = re.sub(r'_+$', '', k)
110
+ result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
111
+ result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
112
+ result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
113
+ result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
225
114
 
226
- if 'int' in column_type_lower:
227
- return int(value) if value is not None else None
228
- elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
229
- return float(value) if value is not None else None
230
- elif '日期' in column_type_lower or 'time' in column_type_lower:
231
- if isinstance(value, (datetime.datetime, pd.Timestamp)):
232
- return value.strftime('%Y-%m-%d %H:%M:%S')
233
- elif isinstance(value, str):
234
- try:
235
- return self._validate_datetime(value) # 使用专门的日期验证方法
236
- except ValueError as e:
237
- raise ValueError(f"无效日期格式: {value} - {str(e)}")
238
- return str(value)
239
- elif 'char' in column_type_lower or 'text' in column_type_lower:
240
- # 防止SQL注入
241
- if isinstance(value, str):
242
- return value.replace('\\', '\\\\').replace("'", "\\'")
243
- return str(value)
244
- elif 'json' in column_type_lower:
245
- import json
246
- return json.dumps(value) if value is not None else None
115
+ date_type = otk.is_valid_date(v) # 判断日期时间
116
+ int_num = otk.is_integer(v) # 判断整数
117
+ count_int, count_float = count_decimal_places(v) # 判断小数,返回小数位数
118
+ if result1: # 京东sku/spu商品信息
119
+ __res_dict.update({k: 'varchar(100)'})
120
+ elif k == '日期':
121
+ __res_dict.update({k: 'DATE'})
122
+ elif k == '更新时间':
123
+ __res_dict.update({k: 'TIMESTAMP'})
124
+ elif result2: # 小数
125
+ __res_dict.update({k: 'decimal(10,4)'})
126
+ elif date_type == 1: # 纯日期
127
+ __res_dict.update({k: 'DATE'})
128
+ elif date_type == 2: # 日期+时间
129
+ __res_dict.update({k: 'DATETIME'})
130
+ elif int_num:
131
+ __res_dict.update({k: 'INT'})
132
+ elif count_float > 0:
133
+ if count_int + count_float > 10:
134
+ if count_float >= 6:
135
+ __res_dict.update({k: 'decimal(14,6)'})
136
+ else:
137
+ __res_dict.update({k: 'decimal(14,4)'})
138
+ elif count_float >= 6:
139
+ __res_dict.update({k: 'decimal(14,6)'})
140
+ elif count_float >= 4:
141
+ __res_dict.update({k: 'decimal(12,4)'})
142
+ else:
143
+ __res_dict.update({k: 'decimal(10,2)'})
247
144
  else:
248
- return value
249
- except (ValueError, TypeError) as e:
250
- error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
251
- if self.logger:
252
- self.logger.error(error_msg)
253
- raise ValueError(error_msg)
145
+ __res_dict.update({k: 'varchar(255)'})
146
+ new_dict_data.update({k: v})
147
+ __res_dict.update({'数据主体': 'longblob'})
148
+ return __res_dict, new_dict_data
254
149
 
255
- def _execute_with_retry(self, func, *args, **kwargs):
150
+ @try_except
151
+ def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
256
152
  """
257
- 带重试机制的SQL执行装饰器
258
-
259
- :param func: 要执行的函数
260
- :param args: 位置参数
261
- :param kwargs: 关键字参数
262
- :return: 函数执行结果
263
- :raises Exception: 如果所有重试都失败
153
+ 插入字典数据
154
+ dict_data: 字典
155
+ index_length: 索引长度
156
+ icm_update: 增量更正
157
+ set_typ: {}
158
+ allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
264
159
  """
160
+ if not self.config:
161
+ return
265
162
 
266
- @wraps(func)
267
- def wrapper(*args, **kwargs):
268
- last_exception = None
269
- for attempt in range(self.max_retries):
270
- try:
271
- result = func(*args, **kwargs)
272
- if attempt > 0 and self.logger:
273
- self.logger.info("Operation succeeded after %d retries", attempt)
274
- return result
275
- except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
276
- last_exception = e
277
- if attempt < self.max_retries - 1:
278
- wait_time = self.retry_interval * (attempt + 1)
279
- if self.logger:
280
- self.logger.warning(
281
- "尝试 %d/%d 失败: %s. %d秒后重试...",
282
- attempt + 1, self.max_retries, str(e), wait_time
283
- )
284
- time.sleep(wait_time)
285
- # 尝试重新连接
286
- try:
287
- self.pool = self._create_connection_pool()
288
- except Exception as reconnect_error:
289
- if self.logger:
290
- self.logger.error("重连失败: %s", str(reconnect_error))
291
- continue
163
+ if not dict_data_list:
164
+ logger.info(f'dict_data_list 不能为空 ')
165
+ return
166
+ dict_data = dict_data_list[0]
167
+ if cut_data:
168
+ if '日期' in dict_data.keys():
169
+ try:
170
+ __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
171
+ __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
172
+ if str(cut_data).lower() == 'year':
173
+ table_name = f'{table_name}_{__y}'
174
+ elif str(cut_data).lower() == 'month':
175
+ table_name = f'{table_name}_{__y_m}'
292
176
  else:
293
- if self.logger:
294
- self.logger.error(
295
- "Operation failed after %d attempts. Last error: %s",
296
- self.max_retries, str(e)
297
- )
298
- except pymysql.IntegrityError as e:
299
- # 完整性错误通常不需要重试
300
- if self.logger:
301
- self.logger.error("完整性约束错误: %s", str(e))
302
- raise e
177
+ logger.info(f'参数不正确,cut_data应为 year 或 month ')
303
178
  except Exception as e:
304
- last_exception = e
305
- if self.logger:
306
- self.logger.error("发生意外错误: %s", str(e))
307
- break
308
-
309
- raise last_exception if last_exception else Exception("发生未知错误")
310
-
311
- return wrapper(*args, **kwargs)
179
+ logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
312
180
 
313
- def _get_connection(self):
314
- """从连接池获取连接"""
315
- try:
316
- conn = self.pool.connection()
317
- if self.logger:
318
- self.logger.debug("成功获取数据库连接")
319
- return conn
320
- except Exception as e:
321
- if self.logger:
322
- self.logger.error("连接数据库失败: %s", str(e))
323
- raise ConnectionError(f"连接数据库失败: {str(e)}")
181
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
182
+ if not connection:
183
+ return
184
+ with connection.cursor() as cursor:
185
+ cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
186
+ database_exists = cursor.fetchone()
187
+ if not database_exists:
188
+ # 如果数据库不存在,则新建
189
+ sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
190
+ cursor.execute(sql)
191
+ connection.commit()
192
+ logger.info(f"创建Database: {db_name}")
324
193
 
325
- def _check_database_exists(self, db_name: str) -> bool:
326
- """检查数据库是否存在"""
327
- db_name = self._validate_identifier(db_name)
328
- sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
194
+ self.config.update({'database': db_name}) # 添加更新 config 字段
195
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
196
+ if not connection:
197
+ return
198
+ with connection.cursor() as cursor:
199
+ # 1. 查询表, 不存在则创建一个空表
200
+ sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
201
+ cursor.execute(sql, (table_name,))
202
+ if not cursor.fetchone():
203
+ sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
204
+ cursor.execute(sql)
205
+ logger.info(f'创建 mysql 表: {table_name}')
329
206
 
330
- try:
331
- with self._get_connection() as conn:
332
- with conn.cursor() as cursor:
333
- cursor.execute(sql, (db_name,))
334
- exists = bool(cursor.fetchone())
335
- if self.logger:
336
- self.logger.debug("数据库 %s 已存在: %s", db_name, exists)
337
- return exists
338
- except Exception as e:
339
- if self.logger:
340
- self.logger.error("检查数据库是否存在时出错: %s", str(e))
341
- raise
207
+ # 根据 dict_data 的值添加指定的数据类型
208
+ dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
209
+ if set_typ:
210
+ # 更新自定义的列数据类型
211
+ for k, v in dtypes.copy().items():
212
+ # 确保传进来的 set_typ 键存在于实际的 df 列才 update
213
+ [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
342
214
 
343
- def _create_database(self, db_name: str):
344
- """创建数据库"""
345
- db_name = self._validate_identifier(db_name)
346
- sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
215
+ # 检查列
216
+ sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
217
+ cursor.execute(sql, (db_name, table_name))
218
+ col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
219
+ col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
220
+ # 不存在则新建列
221
+ if col_not_exist: # 数据表中不存在的列
222
+ for col in col_not_exist:
223
+ # 创建列,需转义
224
+ if allow_not_null:
225
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
226
+ else:
227
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
347
228
 
348
- try:
349
- with self._get_connection() as conn:
350
- with conn.cursor() as cursor:
351
229
  cursor.execute(sql)
352
- conn.commit()
353
- if self.logger:
354
- self.logger.info("数据库 %s 创建成功", db_name)
355
- except Exception as e:
356
- if self.logger:
357
- self.logger.error("无法创建数据库 %s: %s", db_name, str(e))
358
- conn.rollback()
359
- raise
230
+ logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
360
231
 
361
- def _check_table_exists(self, db_name: str, table_name: str) -> bool:
362
- """检查表是否存在"""
363
- db_name = self._validate_identifier(db_name)
364
- table_name = self._validate_identifier(table_name)
365
- sql = """
366
- SELECT TABLE_NAME
367
- FROM INFORMATION_SCHEMA.TABLES
368
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
369
- """
232
+ if col == '日期':
233
+ sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
234
+ logger.info(f"设置为索引: {col}({dtypes[col]})")
235
+ cursor.execute(sql)
370
236
 
371
- try:
372
- with self._get_connection() as conn:
373
- with conn.cursor() as cursor:
237
+ connection.commit() # 提交事务
238
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
239
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
240
+ # 处理插入的数据
241
+ for dict_data in dict_data_list:
242
+ dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
243
+ if icm_update:
244
+ """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
245
+ sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
374
246
  cursor.execute(sql, (db_name, table_name))
375
- exists = bool(cursor.fetchone())
376
- return exists
377
- except Exception as e:
378
- if self.logger:
379
- self.logger.error("检查数据表是否存在时发生未知错误: %s", str(e))
380
- raise
247
+ columns = cursor.fetchall()
248
+ cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
249
+ # 保留原始列名,不提前转义
250
+ raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
381
251
 
382
- def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
383
- """获取表的列名和数据类型"""
384
- db_name = self._validate_identifier(db_name)
385
- table_name = self._validate_identifier(table_name)
386
- sql = """
387
- SELECT COLUMN_NAME, DATA_TYPE
388
- FROM INFORMATION_SCHEMA.COLUMNS
389
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
390
- ORDER BY ORDINAL_POSITION
391
- """
252
+ # 构建条件参数(使用原始列名)
253
+ condition_params = []
254
+ condition_parts = []
255
+ for up_col in icm_update:
256
+ condition_parts.append(f"`{up_col}` = %s") # SQL 转义
257
+ condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
392
258
 
393
- try:
394
- with self._get_connection() as conn:
395
- with conn.cursor() as cursor:
396
- cursor.execute(sql, (db_name, table_name))
397
- columns = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
398
- if self.logger:
399
- self.logger.debug("获取表 %s.%s 的列信息: %s", db_name, table_name, columns)
400
- return columns
401
- except Exception as e:
402
- if self.logger:
403
- self.logger.error("无法获取表列信息: %s", str(e))
404
- raise
259
+ # 动态转义列名生成 SQL 查询字段
260
+ escaped_update_col = [f'`{col}`' for col in raw_update_col]
261
+ sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
262
+ cursor.execute(sql, condition_params)
263
+ results = cursor.fetchall()
405
264
 
406
- def _prepare_data(
407
- self,
408
- data: Union[Dict, List[Dict], pd.DataFrame],
409
- columns: Dict[str, str],
410
- allow_null: bool = False
411
- ) -> List[Dict]:
412
- """
413
- 准备要上传的数据,验证并转换数据类型
265
+ if results:
266
+ for result in results:
267
+ change_col = []
268
+ change_placeholders = []
269
+ set_params = []
270
+ for raw_col in raw_update_col:
271
+ # 使用原始列名访问数据
272
+ df_value = str(dict_data[raw_col])
273
+ mysql_value = str(result[raw_col])
414
274
 
415
- :param data: 输入数据
416
- :param columns: 列名和数据类型字典 {列名: 数据类型}
417
- :param allow_null: 是否允许空值
418
- :return: 准备好的数据列表
419
- :raises ValueError: 如果数据验证失败
420
- """
421
- # 统一数据格式为字典列表
422
- if isinstance(data, pd.DataFrame):
423
- try:
424
- data = data.replace({pd.NA: None}).to_dict('records')
425
- except Exception as e:
426
- if self.logger:
427
- self.logger.error("Failed to convert DataFrame to dict: %s", str(e))
428
- raise ValueError(f"Failed to convert DataFrame to dict: {str(e)}")
429
- elif isinstance(data, dict):
430
- data = [data]
431
- elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
432
- error_msg = "Data must be a dict, list of dicts, or DataFrame"
433
- if self.logger:
434
- self.logger.error(error_msg)
435
- raise ValueError(error_msg)
275
+ # 清理小数点后多余的零
276
+ if '.' in df_value:
277
+ df_value = re.sub(r'0+$', '', df_value).rstrip('.')
278
+ if '.' in mysql_value:
279
+ mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
436
280
 
437
- prepared_data = []
438
- for row_idx, row in enumerate(data, 1):
439
- prepared_row = {}
440
- for col_name, col_type in columns.items():
441
- # 跳过id列,不允许外部传入id
442
- if col_name.lower() == 'id':
281
+ if df_value != mysql_value:
282
+ change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
283
+ set_params.append(dict_data[raw_col])
284
+ change_col.append(raw_col)
285
+
286
+ if change_placeholders:
287
+ full_params = set_params + condition_params
288
+ sql = f"""UPDATE `{table_name}`
289
+ SET {','.join(change_placeholders)}
290
+ WHERE {' AND '.join(condition_parts)}"""
291
+ cursor.execute(sql, full_params)
292
+ else: # 没有数据返回,则直接插入数据
293
+ # 参数化插入
294
+ cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
295
+ placeholders = ', '.join(['%s'] * len(dict_data))
296
+ sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})"
297
+ cursor.execute(sql, tuple(dict_data.values()))
298
+ connection.commit() # 提交数据库
443
299
  continue
444
300
 
445
- if col_name not in row:
446
- if not allow_null:
447
- error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
448
- if self.logger:
449
- self.logger.error(error_msg)
450
- raise ValueError(error_msg)
451
- prepared_row[col_name] = None
452
- else:
453
- try:
454
- prepared_row[col_name] = self._validate_value(row[col_name], col_type)
455
- except ValueError as e:
456
- error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
457
- if self.logger:
458
- self.logger.error(error_msg)
459
- raise ValueError(error_msg)
460
- prepared_data.append(prepared_row)
301
+ # 标准插入逻辑(参数化修改)
302
+ # 构造更新列(排除主键)
303
+ update_cols = [k for k in dict_data.keys()]
304
+ # 构建SQL
305
+ cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
306
+ placeholders = ', '.join(['%s'] * len(dict_data))
307
+ update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
461
308
 
462
- if self.logger:
463
- self.logger.debug("已准备 %d 行数据", len(prepared_data))
464
- return prepared_data
309
+ sql = f"""INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
310
+ # 执行参数化查询
311
+ try:
312
+ cursor.execute(sql, tuple(dict_data.values()))
313
+ connection.commit()
314
+ except pymysql.Error as e:
315
+ logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
316
+ connection.rollback()
317
+ connection.close()
465
318
 
466
- def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
319
+ # @try_except
320
+ def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
467
321
  """
468
- 获取分表名称
469
-
470
- :param table_name: 基础表名
471
- :param date_value: 日期值
472
- :param partition_by: 分表方式 ('year' 或 'month')
473
- :return: 分表名称
474
- :raises ValueError: 如果日期格式无效或分表方式无效
475
- """
476
- try:
477
- date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
478
- except ValueError:
479
- try:
480
- date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
481
- except ValueError:
482
- error_msg = f"无效的日期格式: {date_value}"
483
- if self.logger:
484
- self.logger.error("无效的日期格式: %s", date_value)
485
- raise ValueError(error_msg)
486
-
487
- if partition_by == 'year':
488
- return f"{table_name}_{date_obj.year}"
489
- elif partition_by == 'month':
490
- return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
491
- else:
492
- error_msg = "partition_by must be 'year' or 'month'"
493
- if self.logger:
494
- self.logger.error(error_msg)
495
- raise ValueError(error_msg)
496
-
497
- def _create_table(
498
- self,
499
- db_name: str,
500
- table_name: str,
501
- columns: Dict[str, str],
502
- primary_keys: Optional[List[str]] = None,
503
- date_column: Optional[str] = None,
504
- indexes: Optional[List[str]] = None,
505
- unique_columns: Optional[List[str]] = None
506
- ):
507
- """
508
- 创建数据表
509
-
510
- :param db_name: 数据库名
511
- :param table_name: 表名
512
- :param columns: 列名和数据类型字典 {列名: 数据类型}
513
- :param primary_keys: 主键列列表
514
- :param date_column: 日期列名,如果存在将设置为索引
515
- :param indexes: 需要创建索引的列列表
516
- :param unique_columns: 需要创建唯一索引的列列表
322
+ 插入字典数据
323
+ dict_data: 字典
324
+ index_length: 索引长度
325
+ icm_update: 增量更新
326
+ set_typ: {}
327
+ allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
517
328
  """
518
- db_name = self._validate_identifier(db_name)
519
- table_name = self._validate_identifier(table_name)
329
+ if not self.config:
330
+ return
520
331
 
521
- if not columns:
522
- error_msg = "No columns specified for table creation"
523
- if self.logger:
524
- self.logger.error(error_msg)
525
- raise ValueError(error_msg)
332
+ if cut_data:
333
+ if '日期' in dict_data.keys():
334
+ try:
335
+ __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
336
+ __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
337
+ if str(cut_data).lower() == 'year':
338
+ table_name = f'{table_name}_{__y}'
339
+ elif str(cut_data).lower() == 'month':
340
+ table_name = f'{table_name}_{__y_m}'
341
+ else:
342
+ logger.info(f'参数不正确,cut_data应为 year 或 month ')
343
+ except Exception as e:
344
+ logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
526
345
 
527
- # 构建列定义SQL
528
- column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
346
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
347
+ if not connection:
348
+ return
349
+ with connection.cursor() as cursor:
350
+ cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
351
+ database_exists = cursor.fetchone()
352
+ if not database_exists:
353
+ # 如果数据库不存在,则新建
354
+ sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
355
+ cursor.execute(sql)
356
+ connection.commit()
357
+ logger.info(f"创建Database: {db_name}")
529
358
 
530
- # 添加其他列定义
531
- for col_name, col_type in columns.items():
532
- # 跳过id列,因为已经在前面添加了
533
- if col_name.lower() == 'id':
534
- continue
535
- safe_col_name = self._validate_identifier(col_name)
536
- col_def = f"`{safe_col_name}` {col_type}"
359
+ self.config.update({'database': db_name}) # 添加更新 config 字段
360
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
361
+ if not connection:
362
+ return
363
+ with connection.cursor() as cursor:
364
+ # 1. 查询表, 不存在则创建一个空表
365
+ sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
366
+ cursor.execute(sql, (table_name,))
367
+ if not cursor.fetchone():
368
+ sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
369
+ cursor.execute(sql)
370
+ logger.info(f'创建 mysql 表: {table_name}')
537
371
 
538
- # 添加NOT NULL约束
539
- if not col_type.lower().startswith('json'):
540
- col_def += " NOT NULL"
372
+ # 根据 dict_data 的值添加指定的数据类型
373
+ dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
374
+ if set_typ:
375
+ # 更新自定义的列数据类型
376
+ for k, v in dtypes.copy().items():
377
+ # 确保传进来的 set_typ 键存在于实际的 df 列才 update
378
+ [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
541
379
 
542
- column_defs.append(col_def)
380
+ # 检查列
381
+ sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
382
+ cursor.execute(sql, (db_name, table_name))
383
+ col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
384
+ col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
385
+ # 不存在则新建列
386
+ if col_not_exist: # 数据表中不存在的列
387
+ for col in col_not_exist:
388
+ # 创建列,需转义
389
+ if allow_not_null:
390
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
391
+ else:
392
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
393
+ cursor.execute(sql)
394
+ logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
543
395
 
544
- # 添加主键定义
545
- if primary_keys:
546
- # 确保id在主键中
547
- if 'id' not in [pk.lower() for pk in primary_keys]:
548
- primary_keys = ['id'] + primary_keys
549
- else:
550
- # 如果没有指定主键,则使用id作为主键
551
- primary_keys = ['id']
396
+ if col == '日期':
397
+ sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
398
+ logger.info(f"设置为索引: {col}({dtypes[col]})")
399
+ cursor.execute(sql)
400
+ connection.commit() # 提交事务
401
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
402
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
403
+ # 处理插入的数据
404
+ if icm_update:
405
+ """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
406
+ sql = """SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s"""
407
+ cursor.execute(sql, (db_name, table_name))
408
+ cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()] # 数据表的所有列, 返回 list
552
409
 
553
- # 添加主键定义
554
- safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
555
- primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
410
+ # 保留原始列名,不提前转义
411
+ raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id']
556
412
 
557
- # 添加唯一索引定义
558
- unique_index_sql = ""
559
- if unique_columns:
560
- for col in unique_columns:
561
- if col.lower() != 'id' and col in columns:
562
- safe_col = self._validate_identifier(col)
563
- unique_index_sql += f", UNIQUE KEY `uk_{safe_col}` (`{safe_col}`)"
413
+ # 构建条件参数(使用原始列名)
414
+ condition_params = []
415
+ condition_parts = []
416
+ for up_col in icm_update:
417
+ condition_parts.append(f"`{up_col}` = %s") # SQL 转义
418
+ condition_params.append(dict_data[up_col]) # 原始列名访问数据
564
419
 
565
- # 构建完整SQL
566
- sql = f"""
567
- CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
568
- {','.join(column_defs)}
569
- {primary_key_sql}
570
- {unique_index_sql}
571
- ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
572
- """
420
+ # 动态转义列名生成 SQL 查询字段
421
+ escaped_update_col = [f'`{col}`' for col in raw_update_col]
422
+ sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
423
+ cursor.execute(sql, condition_params)
424
+ results = cursor.fetchall()
573
425
 
574
- try:
575
- with self._get_connection() as conn:
576
- with conn.cursor() as cursor:
577
- cursor.execute(sql)
578
- if self.logger:
579
- self.logger.info("表 %s.%s 创建成功", db_name, table_name)
426
+ if results:
427
+ for result in results:
428
+ change_col = []
429
+ change_placeholders = []
430
+ set_params = []
431
+ for raw_col in raw_update_col:
432
+ # 使用原始列名访问数据
433
+ df_value = str(dict_data[raw_col])
434
+ mysql_value = str(result[raw_col])
580
435
 
581
- # 添加普通索引
582
- index_statements = []
436
+ # 清理小数点后多余的零
437
+ if '.' in df_value:
438
+ df_value = re.sub(r'0+$', '', df_value).rstrip('.')
439
+ if '.' in mysql_value:
440
+ mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
583
441
 
584
- # 日期列索引
585
- if date_column and date_column in columns:
586
- safe_date_col = self._validate_identifier(date_column)
587
- index_statements.append(
588
- f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
589
- )
442
+ if df_value != mysql_value:
443
+ change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
444
+ set_params.append(dict_data[raw_col])
445
+ change_col.append(raw_col)
590
446
 
591
- # 其他索引
592
- if indexes:
593
- for idx_col in indexes:
594
- if idx_col in columns:
595
- safe_idx_col = self._validate_identifier(idx_col)
596
- index_statements.append(
597
- f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
598
- )
447
+ if change_placeholders:
448
+ full_params = set_params + condition_params
449
+ sql = f"""UPDATE `{table_name}`
450
+ SET {','.join(change_placeholders)}
451
+ WHERE {' AND '.join(condition_parts)}"""
452
+ cursor.execute(sql, full_params)
453
+ else: # 没有数据返回,则直接插入数据
454
+ # 参数化插入语句
455
+ keys = [f"`{k}`" for k in dict_data.keys()]
456
+ placeholders = ','.join(['%s'] * len(dict_data))
457
+ update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
458
+ sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
459
+ cursor.execute(sql, tuple(dict_data.values()))
460
+ connection.commit() # 提交数据库
461
+ connection.close()
462
+ return
599
463
 
600
- # 执行所有索引创建语句
601
- if index_statements:
602
- with conn.cursor() as cursor:
603
- for stmt in index_statements:
604
- cursor.execute(stmt)
605
- if self.logger:
606
- self.logger.debug("Executed index statement: %s", stmt)
464
+ # 常规插入处理(参数化)
465
+ keys = [f"`{k}`" for k in dict_data.keys()]
466
+ placeholders = ','.join(['%s'] * len(dict_data))
467
+ update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
468
+ sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
469
+ cursor.execute(sql, tuple(dict_data.values()))
470
+ connection.commit()
471
+ connection.close()
607
472
 
608
- conn.commit()
609
- if self.logger:
610
- self.logger.info("All indexes created successfully for %s.%s", db_name, table_name)
473
+ def cover_dict_dtypes(self, dict_data):
474
+ """ 清理字典键值 并转换数据类型 """
475
+ if not dict_data:
476
+ logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
477
+ return
478
+ __res_dict = {}
479
+ new_dict_data = {}
480
+ for k, v in dict_data.items():
481
+ k = str(k).lower()
482
+ k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
483
+ k = k.replace(')', '')
484
+ k = re.sub(r'_{2,}', '_', k)
485
+ k = re.sub(r'_+$', '', k)
486
+ if str(v) == '':
487
+ v = 0
488
+ v = str(v)
489
+ v = re.sub('^="|"$', '', v, re.I)
490
+ v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(v)) # 移除控制字符
491
+ if re.findall(r'^[-+]?\d+\.?\d*%$', v):
492
+ v = str(float(v.rstrip("%")) / 100)
611
493
 
612
- except Exception as e:
613
- if self.logger:
614
- self.logger.error("创建表 %s.%s 失败: %s", db_name, table_name, str(e))
615
- conn.rollback()
616
- raise
617
-
618
def upload_data(
        self,
        db_name: str,
        table_name: str,
        data: Union[Dict, List[Dict], pd.DataFrame],
        columns: Dict[str, str],
        primary_keys: Optional[List[str]] = None,
        check_duplicate: bool = False,
        duplicate_columns: Optional[List[str]] = None,
        allow_null: bool = False,
        partition_by: Optional[str] = None,
        partition_date_column: str = '日期',
        auto_create: bool = True,
        replace: bool = False,
        indexes: Optional[List[str]] = None
):
    """
    Upload data to a database table, optionally split into per-period tables.

    :param db_name: database name
    :param table_name: base table name
    :param data: rows to upload (dict, list of dicts or DataFrame); normalised by ``_prepare_data``
    :param columns: mapping of column name -> column type; must not be empty
    :param primary_keys: primary-key columns, forwarded to ``_upload_to_table``
    :param check_duplicate: when true, unique columns are derived for table creation
    :param duplicate_columns: columns used for the duplicate check; when empty, every
        column except ``id`` is used
    :param allow_null: forwarded to ``_prepare_data`` and ``_upload_to_table``
    :param partition_by: ``'year'`` or ``'month'`` to split rows across tables, or None
    :param partition_date_column: column whose value selects the partition table
        (default ``'日期'``)
    :param auto_create: create the database when it does not exist; also forwarded
        to ``_upload_to_table``
    :param replace: forwarded to ``_upload_to_table``
    :param indexes: columns to index, forwarded to ``_upload_to_table``
    :raises ValueError: empty ``columns``, unsupported ``partition_by``, missing database
        with ``auto_create`` disabled, or a row lacking ``partition_date_column``
    """
    log = self.logger

    def fail(message):
        # Every validation failure follows the same pattern: log if possible, then raise.
        if log:
            log.error(message)
        raise ValueError(message)

    if log:
        log.info(
            "开始上传数据到 %s.%s (分表方式=%s, 替换模式=%s)",
            db_name, table_name, partition_by, replace
        )

    # Argument validation.
    if not columns:
        fail("Columns specification is required")
    if partition_by and partition_by not in ('year', 'month'):
        fail("分表方式必须是 'year' 或 'month'")

    rows = self._prepare_data(data, columns, allow_null)

    # Make sure the target database is there.
    if not self._check_database_exists(db_name):
        if not auto_create:
            fail(f"Database '{db_name}' does not exist")
        self._create_database(db_name)

    # Columns that should carry a unique index.
    unique_columns = None
    if check_duplicate:
        unique_columns = duplicate_columns or [name for name in columns if name.lower() != 'id']

    # Work out which rows go to which table; all grouping finishes before any upload starts.
    if partition_by:
        rows_by_table = {}
        for row in rows:
            if partition_date_column not in row:
                fail(f"异常缺失列 '{partition_date_column}'")
            target = self._get_partition_table_name(
                table_name, str(row[partition_date_column]), partition_by
            )
            rows_by_table.setdefault(target, []).append(row)
    else:
        rows_by_table = {table_name: rows}

    for target, target_rows in rows_by_table.items():
        self._upload_to_table(
            db_name, target, target_rows, columns,
            primary_keys, check_duplicate, duplicate_columns,
            allow_null, auto_create, partition_date_column,
            replace, indexes, unique_columns
        )

    if log:
        log.info(
            "成功上传 %d 行数据到 %s.%s",
            len(rows), db_name, table_name
        )
727
-
728
def _upload_to_table(
        self,
        db_name: str,
        table_name: str,
        data: List[Dict],
        columns: Dict[str, str],
        primary_keys: Optional[List[str]],
        check_duplicate: bool,
        duplicate_columns: Optional[List[str]],
        allow_null: bool,
        auto_create: bool,
        date_column: Optional[str],
        replace: bool,
        indexes: Optional[List[str]],
        unique_columns: Optional[List[str]] = None
):
    """
    Internal helper that performs the upload into one concrete table.

    Creates the table when it is missing and ``auto_create`` is set, checks that
    every key of ``columns`` exists in the table, then delegates to ``_insert_data``.

    :raises ValueError: table missing with ``auto_create`` disabled, table columns
        could not be read, or a requested column is absent from the table
    """
    def fail(message):
        # Log when a logger is configured, then abort.
        if self.logger:
            self.logger.error(message)
        raise ValueError(message)

    # Ensure the table exists.
    if not self._check_table_exists(db_name, table_name):
        if not auto_create:
            fail(f"Table '{db_name}.{table_name}' does not exist")
        self._create_table(db_name, table_name, columns, primary_keys, date_column, indexes, unique_columns)

    # Read the table structure and validate it.
    table_columns = self._get_table_columns(db_name, table_name)
    if not table_columns:
        fail(f"Failed to get columns for table '{db_name}.{table_name}'")

    # Every column being uploaded must already exist in the table.
    for col in columns:
        if col in table_columns:
            continue
        fail(f"Column '{col}' not found in table '{db_name}.{table_name}'")

    self._insert_data(
        db_name, table_name, data, columns,
        check_duplicate, duplicate_columns,
        replace=replace
    )
777
-
778
def _insert_data(
        self,
        db_name: str,
        table_name: str,
        data: List[Dict],
        columns: Dict[str, str],
        check_duplicate: bool = False,
        duplicate_columns: Optional[List[str]] = None,
        batch_size: int = 1000,
        replace: bool = False
):
    """
    Insert rows into a table in batches, committing after each batch.

    :param db_name: database name (validated through ``_validate_identifier``)
    :param table_name: table name (validated through ``_validate_identifier``)
    :param data: rows to insert; each row is read with ``row.get(column)``
    :param columns: mapping of column name -> column type; the ``id`` column is skipped
    :param check_duplicate: use ``INSERT IGNORE`` so duplicate rows are skipped
    :param duplicate_columns: accepted for interface compatibility; not used here
    :param batch_size: number of rows per ``executemany`` call, must be >= 1
    :param replace: use ``REPLACE`` instead of ``INSERT`` (takes precedence over
        ``check_duplicate``)
    :raises ValueError: if ``batch_size`` is smaller than 1
    :raises Exception: if a batch fails; the batch is rolled back and the original
        error is attached as ``__cause__``
    """
    db_name = self._validate_identifier(db_name)
    table_name = self._validate_identifier(table_name)

    if not data:
        if self.logger:
            self.logger.warning("No data to insert into %s.%s", db_name, table_name)
        return

    # A zero step crashes range() and a negative one silently inserts nothing,
    # so reject both before a connection is taken from the pool.
    if batch_size < 1:
        error_msg = f"batch_size must be >= 1, got {batch_size}"
        if self.logger:
            self.logger.error(error_msg)
        raise ValueError(error_msg)

    # All columns except the auto-generated id.
    all_columns = [col for col in columns if col.lower() != 'id']
    safe_columns = [self._validate_identifier(col) for col in all_columns]
    placeholders = ','.join(['%s'] * len(safe_columns))

    # REPLACE wins over INSERT IGNORE; the statement text is otherwise the same,
    # so it is built once instead of in two identical branches.
    operation = "REPLACE" if replace else "INSERT IGNORE" if check_duplicate else "INSERT"
    sql = f"""
        {operation} INTO `{db_name}`.`{table_name}`
        (`{'`,`'.join(safe_columns)}`)
        VALUES ({placeholders})
        """

    # Remember the statement text; evict the oldest entry only when a new key
    # is actually being added and there is something to evict.
    cache_key = f"{db_name}.{table_name}.{operation}.{check_duplicate}"
    if cache_key not in self._prepared_statements:
        if self._prepared_statements and len(self._prepared_statements) >= self._max_cached_statements:
            oldest_key = next(iter(self._prepared_statements))
            del self._prepared_statements[oldest_key]
        self._prepared_statements[cache_key] = sql
        if self.logger:
            self.logger.debug("已缓存预处理语句: %s", cache_key)

    # Insert batch by batch.
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            for i in range(0, len(data), batch_size):
                batch = data[i:i + batch_size]
                values = [[row.get(col) for col in all_columns] for row in batch]

                try:
                    start_time = time.time()
                    cursor.executemany(sql, values)
                    conn.commit()  # one commit per batch
                    if self.logger:
                        self.logger.debug(
                            "成功插入批次 %d-%d/%d 到 %s.%s, 耗时 %.2f 秒",
                            i + 1, min(i + batch_size, len(data)), len(data),
                            db_name, table_name, time.time() - start_time
                        )
                except Exception as e:
                    conn.rollback()
                    error_msg = f"Failed to insert batch {i + 1}-{min(i + batch_size, len(data))}/{len(data)} into {db_name}.{table_name}: {str(e)}"
                    if self.logger:
                        self.logger.error(error_msg)
                    # Chain explicitly so the driver error stays visible to callers.
                    raise Exception(error_msg) from e
874
-
875
def close(self):
    """
    Close the connection pool and drop the reference to it.

    Does nothing when no pool is set. On failure the error is logged (when a
    logger is configured) and re-raised, and ``self.pool`` is left untouched.

    :raises Exception: whatever ``self.pool.close()`` raises
    """
    pool = getattr(self, 'pool', None)
    if not pool:
        return

    try:
        # The previous implementation first tried to drain the pool by borrowing
        # connections and closing them. A borrowed pooled connection is handed back
        # to the pool on close() rather than closed (per the DBUtils docs - confirm
        # against the installed version), so that loop could not empty the pool and
        # relied on a private attribute. The pool's own close() is sufficient.
        pool.close()
        if self.logger:
            self.logger.info("连接池已成功关闭")
    except Exception as e:
        if self.logger:
            self.logger.error("关闭连接池失败: %s", str(e))
        raise
    self.pool = None
899
-
900
- def __enter__(self):
901
- return self
902
-
903
- def __exit__(self, exc_type, exc_val, exc_tb):
904
- self.close()
905
- if exc_type is not None and self.logger:
906
- self.logger.error(
907
- "Exception occurred: %s: %s",
908
- exc_type.__name__, str(exc_val),
909
- exc_info=(exc_type, exc_val, exc_tb)
910
- )
911
-
912
-
913
- class MysqlUpload:
914
- def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
915
- self.username = username
916
- self.password = password
917
- self.host = host
918
- self.port = port
919
- if username == '' or password == '' or host == '' or port == 0:
920
- self.config = None
921
- else:
922
- self.config = {
923
- 'host': self.host,
924
- 'port': int(self.port),
925
- 'user': self.username,
926
- 'password': self.password,
927
- 'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
928
- 'cursorclass': pymysql.cursors.DictCursor,
929
- }
930
- self.filename = None
931
-
932
- @staticmethod
933
- def try_except(func): # 在类内部定义一个异常处理方法
934
-
935
- @wraps(func)
936
- def wrapper(*args, **kwargs):
937
- try:
938
- return func(*args, **kwargs)
939
- except Exception as e:
940
- logger.error(f'{func.__name__}, {e}') # 将异常信息返回
941
-
942
- return wrapper
943
-
944
def keep_connect(self, _db_name, _config, max_try: int=10):
    """
    Open a pymysql connection, retrying on failure.

    :param _db_name: database name, used only in log messages
    :param _config: keyword arguments passed to ``pymysql.connect``
    :param max_try: maximum number of connection attempts
    :return: the open connection, or None when every attempt failed
    """
    for attempts in range(1, max_try + 1):
        try:
            return pymysql.connect(**_config)  # connect to the database
        except Exception as e:
            logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
            # Wait only when another attempt follows; sleeping after the last
            # failure would just delay the None result by 30 seconds.
            if attempts < max_try:
                time.sleep(30)
    logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
    return None
956
-
957
- def cover_doc_dtypes(self, dict_data):
958
- """ 清理字典键值 并转换数据类型 """
959
- if not dict_data:
960
- logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
961
- return
962
- __res_dict = {}
963
- new_dict_data = {}
964
- for k, v in dict_data.items():
965
- k = str(k).lower()
966
- k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
967
- k = k.replace(')', '')
968
- k = re.sub(r'_{2,}', '_', k)
969
- k = re.sub(r'_+$', '', k)
970
- result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
971
- result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
972
- result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
973
- result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
494
+ result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
495
+ result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
496
+ result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
497
+ result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
974
498
 
975
499
  date_type = otk.is_valid_date(v) # 判断日期时间
976
500
  int_num = otk.is_integer(v) # 判断整数
@@ -991,6 +515,8 @@ class MysqlUpload:
991
515
  __res_dict.update({k: 'INT'})
992
516
  elif count_float > 0:
993
517
  if count_int + count_float > 10:
518
+ # if count_float > 5:
519
+ # v = round(float(v), 4)
994
520
  if count_float >= 6:
995
521
  __res_dict.update({k: 'decimal(14,6)'})
996
522
  else:
@@ -1004,45 +530,110 @@ class MysqlUpload:
1004
530
  else:
1005
531
  __res_dict.update({k: 'varchar(255)'})
1006
532
  new_dict_data.update({k: v})
1007
- __res_dict.update({'数据主体': 'longblob'})
1008
533
  return __res_dict, new_dict_data
1009
534
 
535
def convert_df_dtypes(self, df: pd.DataFrame):
    """
    Clean a DataFrame and choose a MySQL column type for each of its columns.

    The frame is first passed through ``otk.cover_df``. Each column is then
    typed by the first matching rule, in this order: column-name patterns,
    the two exact names ``日期`` / ``更新时间``, the pandas dtype, and finally
    ``varchar(255)``.

    :param df: input DataFrame
    :return: tuple ``(types, df)`` - a dict of column name -> MySQL type, and the
        cleaned DataFrame returned by ``otk.cover_df``
    """
    df = otk.cover_df(df=df)  # clean the values and column names

    # NOTE(review): the original also called pd.to_numeric(df[col], errors='ignore')
    # for every column and discarded the result, so nothing was converted. The call
    # is dropped here (errors='ignore' is deprecated in pandas as well). If numeric
    # coercion was intended, the result must be assigned back - that would change
    # the inferred column types, so it is deliberately not done here.

    # Column-name patterns, checked in order (case-insensitive).
    name_rules = (
        (r'编码|_?id|货号|款号|文件大小', 'varchar(50)'),  # id / sku / spu style identifiers
        (r'占比$|投产$|产出$|roi$|率$', 'decimal(10,4)'),  # decimals
        (r'同比$|环比$', 'decimal(12,4)'),  # decimals
        (r'花费$|消耗$|金额$', 'decimal(12,2)'),  # decimals
    )
    exact_names = {'日期': 'date', '更新时间': 'timestamp'}
    dtype_rules = (
        ('int64', 'int'),
        ('float64', 'decimal(10,4)'),
        ('bool', 'boolean'),
        ('datetime64[ns]', 'datetime'),
    )

    column_types = {}
    for column, dtype in df.dtypes.to_dict().items():
        label = str(column)  # regex matching needs text even for non-string labels
        sql_type = next(
            (mysql_type for pattern, mysql_type in name_rules
             if re.search(pattern, label, re.IGNORECASE)),
            None
        )
        if sql_type is None:
            sql_type = exact_names.get(column)
        if sql_type is None:
            sql_type = next(
                (mysql_type for dtype_name, mysql_type in dtype_rules if dtype == dtype_name),
                'varchar(255)'
            )
        column_types[column] = sql_type
    return column_types, df
570
+
1010
571
  @try_except
1011
- def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
572
+ def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False,
573
+ filename=None, count=None, allow_not_null=False, cut_data=None):
1012
574
  """
1013
- 插入字典数据
1014
- dict_data: 字典
1015
- index_length: 索引长度
1016
- icm_update: 增量更正
1017
- set_typ: {}
575
+ db_name: 数据库名
576
+ table_name: 表名
577
+ move_insert: 根据df 的日期,先移除数据库数据,再插入, df_sql, icm_update 都要设置为 False
578
+ 原则上只限于聚合数据使用,原始数据插入时不要设置
579
+ df_sql: 这是一个临时参数, 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重,初创表大量上传数据的时候使用
580
+ icm_update: 增量更新, 在聚合数据中使用,原始文件不要使用
581
+ 使用增量更新: 必须确保 icm_update 传进来的列必须是数据表中唯一主键,值不会发生变化,不会重复,否则可能产生错乱覆盖情况
582
+ filename: 用来追踪处理进度,传这个参数是方便定位产生错误的文件
1018
583
  allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
1019
584
  """
1020
585
  if not self.config:
1021
586
  return
587
+ if icm_update:
588
+ if move_insert or df_sql:
589
+ logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
590
+ return
591
+ if move_insert:
592
+ if icm_update or df_sql:
593
+ logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
594
+ return
1022
595
 
1023
- if not dict_data_list:
1024
- logger.info(f'dict_data_list 不能为空 ')
596
+ self.filename = filename
597
+ if isinstance(df, pd.DataFrame):
598
+ if len(df) == 0:
599
+ logger.info(f'{db_name}: {table_name} 传入的 df 数据长度为0, {self.filename}')
600
+ return
601
+ else:
602
+ logger.info(f'{db_name}: {table_name} 传入的 df 不是有效的 dataframe 结构, {self.filename}')
1025
603
  return
1026
- dict_data = dict_data_list[0]
604
+ if not db_name or db_name == 'None':
605
+ logger.info(f'{db_name} 不能为 None')
606
+ return
607
+
1027
608
  if cut_data:
1028
- if '日期' in dict_data.keys():
609
+ if '日期' in df.columns.tolist():
1029
610
  try:
1030
- __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
1031
- __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
611
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
612
+ min_year = df['日期'].min(skipna=True).year
613
+ min_month = df['日期'].min(skipna=True).month
614
+ if 0 < int(min_month) < 10 and not str(min_month).startswith('0'):
615
+ min_month = f'0{min_month}'
1032
616
  if str(cut_data).lower() == 'year':
1033
- table_name = f'{table_name}_{__y}'
617
+ table_name = f'{table_name}_{min_year}'
1034
618
  elif str(cut_data).lower() == 'month':
1035
- table_name = f'{table_name}_{__y_m}'
619
+ table_name = f'{table_name}_{min_year}-{min_month}'
1036
620
  else:
1037
621
  logger.info(f'参数不正确,cut_data应为 year 或 month ')
1038
622
  except Exception as e:
1039
623
  logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
624
+ # 清理 dataframe 非法值,并转换获取数据类型
625
+ dtypes, df = self.convert_df_dtypes(df)
626
+ if set_typ:
627
+ # 更新自定义的列数据类型
628
+ for k, v in dtypes.copy().items():
629
+ # 确保传进来的 set_typ 键存在于实际的 df 列才 update
630
+ [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
1040
631
 
1041
632
  connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1042
633
  if not connection:
1043
634
  return
1044
635
  with connection.cursor() as cursor:
1045
- cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
636
+ cursor.execute("SHOW DATABASES LIKE %s", (db_name,)) # 检查数据库是否存在
1046
637
  database_exists = cursor.fetchone()
1047
638
  if not database_exists:
1048
639
  # 如果数据库不存在,则新建
@@ -1060,960 +651,1783 @@ class MysqlUpload:
1060
651
  sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
1061
652
  cursor.execute(sql, (table_name,))
1062
653
  if not cursor.fetchone():
1063
- sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
1064
- cursor.execute(sql)
654
+ create_table_sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY)"
655
+ cursor.execute(create_table_sql)
1065
656
  logger.info(f'创建 mysql 表: {table_name}')
1066
657
 
1067
- # 根据 dict_data 的值添加指定的数据类型
1068
- dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
1069
- if set_typ:
1070
- # 更新自定义的列数据类型
1071
- for k, v in dtypes.copy().items():
1072
- # 确保传进来的 set_typ 键存在于实际的 df 列才 update
1073
- [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
1074
-
1075
- # 检查列
658
+ # 有特殊字符不需转义
1076
659
  sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
1077
660
  cursor.execute(sql, (db_name, table_name))
1078
- col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
1079
- col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
1080
- # 不存在则新建列
661
+ col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
662
+ cols = df.columns.tolist()
663
+ col_not_exist = [col for col in cols if col not in col_exist]
664
+
665
+ # 检查列,不存在则新建列
1081
666
  if col_not_exist: # 数据表中不存在的列
1082
667
  for col in col_not_exist:
1083
668
  # 创建列,需转义
1084
- if allow_not_null:
1085
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
1086
- else:
1087
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
1088
-
1089
- cursor.execute(sql)
669
+ alter_sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]}"
670
+ if not allow_not_null:
671
+ alter_sql += " NOT NULL"
672
+ cursor.execute(alter_sql)
1090
673
  logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
1091
674
 
675
+ # 创建索引
1092
676
  if col == '日期':
1093
- sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
1094
- logger.info(f"设置为索引: {col}({dtypes[col]})")
1095
- cursor.execute(sql)
1096
-
677
+ sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
678
+ cursor.execute(sql, (col,))
679
+ result = cursor.fetchone() # 检查索引是否存在
680
+ if not result:
681
+ cursor.execute(f"CREATE INDEX index_name ON `{table_name}`(`{col}`)")
1097
682
  connection.commit() # 提交事务
1098
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
1099
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
1100
- # 处理插入的数据
1101
- for dict_data in dict_data_list:
1102
- dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
1103
- if icm_update:
1104
- """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
1105
- sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
1106
- cursor.execute(sql, (db_name, table_name))
1107
- columns = cursor.fetchall()
1108
- cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
1109
- # 保留原始列名,不提前转义
1110
- raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
1111
683
 
1112
- # 构建条件参数(使用原始列名)
1113
- condition_params = []
1114
- condition_parts = []
1115
- for up_col in icm_update:
1116
- condition_parts.append(f"`{up_col}` = %s") # SQL 转义
1117
- condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
684
+ if df_sql:
685
+ logger.info(f'正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
686
+ engine = create_engine(
687
+ f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
688
+ df.to_sql(
689
+ name=table_name,
690
+ con=engine,
691
+ if_exists='append',
692
+ index=False,
693
+ chunksize=1000,
694
+ method='multi'
695
+ )
696
+ connection.commit() # 提交事务
697
+ connection.close()
698
+ return
1118
699
 
1119
- # 动态转义列名生成 SQL 查询字段
1120
- escaped_update_col = [f'`{col}`' for col in raw_update_col]
1121
- sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
1122
- cursor.execute(sql, condition_params)
1123
- results = cursor.fetchall()
700
+ # 5. 移除指定日期范围内的数据,原则上只限于聚合数据使用,原始数据插入时不要设置
701
+ if move_insert and '日期' in df.columns.tolist():
702
+ # 移除数据
703
+ dates = df['日期'].values.tolist()
704
+ dates = [pd.to_datetime(item) for item in dates] # 需要先转换类型才能用 min, max
705
+ start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
706
+ end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
1124
707
 
1125
- if results:
1126
- for result in results:
1127
- change_col = []
1128
- change_placeholders = []
1129
- set_params = []
1130
- for raw_col in raw_update_col:
1131
- # 使用原始列名访问数据
1132
- df_value = str(dict_data[raw_col])
1133
- mysql_value = str(result[raw_col])
708
+ delete_sql = f"""
709
+ DELETE FROM `{table_name}`
710
+ WHERE 日期 BETWEEN %s AND %s
711
+ """
712
+ cursor.execute(delete_sql, (start_date, end_date))
713
+ connection.commit()
1134
714
 
1135
- # 清理小数点后多余的零
1136
- if '.' in df_value:
1137
- df_value = re.sub(r'0+$', '', df_value).rstrip('.')
1138
- if '.' in mysql_value:
1139
- mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
715
+ # 插入数据
716
+ engine = create_engine(
717
+ f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
718
+ df.to_sql(
719
+ name=table_name,
720
+ con=engine,
721
+ if_exists='append',
722
+ index=False,
723
+ chunksize=1000,
724
+ method='multi'
725
+ )
726
+ return
1140
727
 
1141
- if df_value != mysql_value:
1142
- change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
1143
- set_params.append(dict_data[raw_col])
1144
- change_col.append(raw_col)
728
+ datas = df.to_dict(orient='records')
729
+ for data in datas:
730
+ # data 是传进来待处理的数据, 不是数据库数据
731
+ # data 示例: {'日期': Timestamp('2024-08-27 00:00:00'), '推广费余额': 33299, '品销宝余额': 2930.73, '短信剩余': 67471}
732
+ try:
733
+ # 预处理数据:转换非字符串类型
734
+ processed_data = {}
735
+ for k, v in data.items():
736
+ if isinstance(v, (int, float)):
737
+ processed_data[k] = float(v)
738
+ elif isinstance(v, pd.Timestamp):
739
+ processed_data[k] = v.strftime('%Y-%m-%d')
740
+ else:
741
+ processed_data[k] = str(v)
1145
742
 
1146
- if change_placeholders:
1147
- full_params = set_params + condition_params
1148
- sql = f"""UPDATE `{table_name}`
1149
- SET {','.join(change_placeholders)}
1150
- WHERE {' AND '.join(condition_parts)}"""
1151
- cursor.execute(sql, full_params)
1152
- else: # 没有数据返回,则直接插入数据
1153
- # 参数化插入
1154
- cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
1155
- placeholders = ', '.join(['%s'] * len(dict_data))
1156
- sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})"
1157
- cursor.execute(sql, tuple(dict_data.values()))
1158
- connection.commit() # 提交数据库
1159
- continue
743
+ # 构建基础SQL要素
744
+ columns = [f'`{k}`' for k in processed_data.keys()]
745
+ placeholders = ', '.join(['%s'] * len(processed_data))
746
+ values = list(processed_data.values())
1160
747
 
1161
- # 标准插入逻辑(参数化修改)
1162
- # 构造更新列(排除主键)
1163
- update_cols = [k for k in dict_data.keys()]
1164
- # 构建SQL
1165
- cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
1166
- placeholders = ', '.join(['%s'] * len(dict_data))
1167
- update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
748
+ # 构建基本INSERT语句
749
+ insert_sql = f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({placeholders})"
1168
750
 
1169
- sql = f"""INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
1170
- # 执行参数化查询
1171
- try:
1172
- cursor.execute(sql, tuple(dict_data.values()))
1173
- connection.commit()
1174
- except pymysql.Error as e:
1175
- logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
1176
- connection.rollback()
1177
- connection.close()
751
+ if icm_update: # 增量更新, 专门用于聚合数据,其他库不要调用
752
+ # 获取数据表结构
753
+ cursor.execute(
754
+ "SELECT COLUMN_NAME FROM information_schema.columns "
755
+ "WHERE table_schema = %s AND table_name = %s",
756
+ (db_name, table_name)
757
+ )
758
+ cols_exist = [row['COLUMN_NAME'] for row in cursor.fetchall()]
759
+ update_columns = [col for col in cols_exist if col not in icm_update and col != 'id']
1178
760
 
1179
- # @try_except
1180
- def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
1181
- """
1182
- 插入字典数据
1183
- dict_data: 字典
1184
- index_length: 索引长度
1185
- icm_update: 增量更新
1186
- set_typ: {}
1187
- allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
1188
- """
1189
- if not self.config:
1190
- return
761
+ # 构建WHERE条件
762
+ where_conditions = []
763
+ where_values = []
764
+ for col in icm_update:
765
+ where_conditions.append(f"`{col}` = %s")
766
+ where_values.append(processed_data[col])
1191
767
 
1192
- if cut_data:
1193
- if '日期' in dict_data.keys():
1194
- try:
1195
- __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
1196
- __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
1197
- if str(cut_data).lower() == 'year':
1198
- table_name = f'{table_name}_{__y}'
1199
- elif str(cut_data).lower() == 'month':
1200
- table_name = f'{table_name}_{__y_m}'
1201
- else:
1202
- logger.info(f'参数不正确,cut_data应为 year 或 month ')
1203
- except Exception as e:
1204
- logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
768
+ # 查询现有数据
769
+ select_sql = f"SELECT {', '.join([f'`{col}`' for col in update_columns])} " \
770
+ f"FROM `{table_name}` WHERE {' AND '.join(where_conditions)}"
771
+ cursor.execute(select_sql, where_values)
772
+ existing_data = cursor.fetchone()
1205
773
 
1206
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1207
- if not connection:
1208
- return
1209
- with connection.cursor() as cursor:
1210
- cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
1211
- database_exists = cursor.fetchone()
1212
- if not database_exists:
1213
- # 如果数据库不存在,则新建
1214
- sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
1215
- cursor.execute(sql)
1216
- connection.commit()
1217
- logger.info(f"创建Database: {db_name}")
774
+ if existing_data:
775
+ # 比较并构建更新语句
776
+ update_set = []
777
+ update_values = []
778
+ for col in update_columns:
779
+ db_value = existing_data[col]
780
+ new_value = processed_data[col]
781
+
782
+ # 处理数值类型的精度差异
783
+ if isinstance(db_value, float) and isinstance(new_value, float):
784
+ if not math.isclose(db_value, new_value, rel_tol=1e-9):
785
+ update_set.append(f"`{col}` = %s")
786
+ update_values.append(new_value)
787
+ elif db_value != new_value:
788
+ update_set.append(f"`{col}` = %s")
789
+ update_values.append(new_value)
790
+
791
+ if update_set:
792
+ update_sql = f"UPDATE `{table_name}` SET {', '.join(update_set)} " \
793
+ f"WHERE {' AND '.join(where_conditions)}"
794
+ cursor.execute(update_sql, update_values + where_values)
795
+ else:
796
+ cursor.execute(insert_sql, values)
797
+ else:
798
+ # 普通插入
799
+ cursor.execute(insert_sql, values)
800
+ except Exception as e:
801
+ pass
802
+ connection.commit() # 提交事务
803
+ connection.close()
804
+
805
+
806
class OptimizeDatas:
    """
    Maintenance helper that removes redundant (duplicate) rows from MySQL tables.

    Process:
        1. Read every table of the target database.
        2. Inspect each table's columns. When a `日期` column of type date/datetime exists, the
           date window is walked one day at a time; otherwise the whole table is read at once.
        3. Rows are compared after dropping `id` and the excluded columns; the ids of rows whose
           remaining content was already seen are deleted.

    Attributes the caller sets before running:
        db_name_lists: databases processed by optimize_list().
        db_name: database processed by optimize() / rename_column().
        days: window length in days, counted back from end_date (overrides start_date when truthy).
        start_date / end_date: optional window bounds; end_date defaults to today.
    """

    # Columns ignored by default when two rows are compared. A tuple avoids a shared mutable default.
    _DEFAULT_EXCEPT_KEYS = ('更新时间',)
    # Removes the all-zero fraction of quoted values in a row's repr: "'12.00', " -> "'12', ".
    _TRAILING_ZEROS = re.compile(r'\.0+\', ')

    def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
        """Store the credentials and build the base pymysql connection config."""
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.charset = charset
        self.config = {
            'host': self.host,
            'port': int(self.port),
            'user': self.username,
            'password': self.password,
            'charset': self.charset,  # utf8mb4 stores four-byte UTF-8 characters
            'cursorclass': pymysql.cursors.DictCursor,
        }
        self.db_name_lists: list = []  # databases to de-duplicate in optimize_list()
        self.db_name = None
        self.days: int = 63  # de-duplicate the most recent N days
        self.end_date = None
        self.start_date = None
        self.connection = None

    @staticmethod
    def try_except(func):
        """Decorator that logs any exception raised by `func` and returns None instead of raising."""

        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                logger.error(f'{func.__name__}, {e}')

        return wrapper

    def keep_connect(self, _db_name, _config, max_try: int = 10):
        """
        Open a pymysql connection, retrying up to `max_try` times.

        :param _db_name: name used in log messages only
        :param _config: keyword arguments forwarded to pymysql.connect
        :param max_try: maximum number of connection attempts
        :return: an open connection, or None when every attempt failed
        """
        for attempt in range(1, max_try + 1):
            try:
                return pymysql.connect(**_config)
            except Exception as e:
                logger.error(f'{_db_name}连接失败,正在重试: {self.host}:{self.port} {attempt}/{max_try} {e}')
                if attempt < max_try:
                    # Waiting only makes sense when another attempt follows.
                    time.sleep(30)
        logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
        return None

    def optimize_list(self):
        """De-duplicate every database listed in self.db_name_lists."""
        if not self.db_name_lists:
            logger.info(f'尚未设置参数: self.db_name_lists')
            return
        for db_name in self.db_name_lists:
            self.db_name = db_name
            self.optimize()

    def optimize(self, except_key=_DEFAULT_EXCEPT_KEYS):
        """
        De-duplicate every table of self.db_name.

        :param except_key: column names ignored when rows are compared
        """
        if not self.db_name:
            logger.info(f'尚未设置参数: self.db_name')
            return
        tables = self.table_list(db_name=self.db_name)
        if not tables:
            logger.info(f'{self.db_name} -> 数据表不存在')
            return

        # Resolve the date window once; per-table clamping works on local copies.
        if not self.end_date:
            self.end_date = pd.to_datetime(datetime.datetime.today())
        else:
            self.end_date = pd.to_datetime(self.end_date)
        if self.days:
            self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
        if not self.start_date:
            self.start_date = self.end_date
        else:
            self.start_date = pd.to_datetime(self.start_date)

        logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
        for table_dict in tables:
            for table_name in table_dict.values():
                self.config.update({'database': self.db_name})
                self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
                if not self.connection:
                    return
                try:
                    self._optimize_table(table_name, except_key)
                finally:
                    # Always release the connection, including for empty tables and on errors.
                    self.connection.close()
        logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')

    def _optimize_table(self, table_name, except_key):
        """De-duplicate one table through the already opened self.connection."""
        bounds = None
        with self.connection.cursor() as cursor:
            cursor.execute(f"SELECT 1 FROM `{table_name}` LIMIT 1")
            if not cursor.fetchone():
                logger.info(f'数据表: {table_name}, 数据长度为 0')
                return

            cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")
            date_exist = any(
                col['Field'] == '日期' and (col['Type'] == 'date' or col['Type'].startswith('datetime'))
                for col in cursor.fetchall()
            )
            if date_exist:
                cursor.execute(
                    f"SELECT MIN(`日期`) AS min_date, MAX(`日期`) AS max_date FROM `{table_name}`"
                )
                bounds = cursor.fetchone()

        if not date_exist:
            self.delete_duplicate2(table_name=table_name, except_key=except_key)
            return

        # Clamp the window to the dates present in the table without touching the
        # instance-level window, so later tables start from the configured range again.
        first_day, last_day = self.start_date, self.end_date
        table_min = pd.to_datetime(bounds['min_date'])
        table_max = pd.to_datetime(bounds['max_date'])
        if first_day < table_min:
            first_day = table_min
        if last_day > table_max:
            last_day = table_max
        for date in self.day_list(start_date=first_day, end_date=last_day):
            self.delete_duplicate(table_name=table_name, date=date, except_key=except_key)

    def _find_duplicate_ids(self, rows, except_key, table_name):
        """Return the ids of rows whose content (without id / excluded columns) repeats an earlier row."""
        ignored = set(except_key or ())
        ignored.add('id')
        seen = set()  # set membership keeps the scan linear in the number of rows
        duplicate_id = []
        for row in rows:
            if 'id' not in row:
                # Without an id the row cannot be targeted by the DELETE below.
                logger.debug(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> 缺少 id 列')
                continue
            payload = {k: v for k, v in row.items() if k not in ignored}
            fingerprint = self._TRAILING_ZEROS.sub("', ", str(payload))
            if fingerprint in seen:
                if row['id'] is not None:
                    duplicate_id.append(row['id'])
                continue
            seen.add(fingerprint)
        return duplicate_id

    def _delete_rows(self, table_name, duplicate_id):
        """Delete the given ids and commit; return the affected row count, or None after a rollback."""
        try:
            with self.connection.cursor() as cursor:
                placeholders = ', '.join(['%s'] * len(duplicate_id))
                cursor.execute(f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})", duplicate_id)
                removed = cursor.rowcount
            self.connection.commit()
            return removed
        except Exception as e:
            logger.error(f'{self.db_name}/{table_name}, {e}')
            self.connection.rollback()
            return None

    def delete_duplicate(self, table_name, date, except_key=_DEFAULT_EXCEPT_KEYS):
        """
        Remove duplicated rows of `table_name` whose `日期` equals `date`.

        :param table_name: table to clean
        :param date: the day to process (must support strftime)
        :param except_key: column names ignored when rows are compared
        """
        datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
        if not datas:
            return
        duplicate_id = self._find_duplicate_ids(datas, except_key, table_name)
        if not duplicate_id:
            return
        removed = self._delete_rows(table_name, duplicate_id)
        if removed is not None:
            logger.debug(f"{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {removed}")

    def delete_duplicate2(self, table_name, except_key=_DEFAULT_EXCEPT_KEYS):
        """
        Remove duplicated rows of a table that has no `日期` column (whole table is read).

        :param table_name: table to clean
        :param except_key: column names ignored when rows are compared
        """
        with self.connection.cursor() as cursor:
            cursor.execute(f"SELECT * FROM `{table_name}`")
            datas = cursor.fetchall()
        if not datas:
            return
        duplicate_id = self._find_duplicate_ids(datas, except_key, table_name)
        if not duplicate_id:
            return
        removed = self._delete_rows(table_name, duplicate_id)
        if removed is not None:
            logger.info(f"{table_name} -> before: {len(datas)}, "
                        f"remove: {removed}")

    def database_list(self):
        """Return the rows of SHOW DATABASES, or None when no connection could be opened."""
        connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
        if not connection:
            return
        try:
            with connection.cursor() as cursor:
                cursor.execute("SHOW DATABASES")
                return cursor.fetchall()
        finally:
            connection.close()

    def table_list(self, db_name):
        """
        Return the rows of SHOW TABLES for `db_name`.

        :return: list of row dicts, or None when the database is missing or unreachable
        """
        # The existence check must not depend on a database selected by an earlier call.
        probe_config = {k: v for k, v in self.config.items() if k != 'database'}
        connection = self.keep_connect(_db_name=self.db_name, _config=probe_config, max_try=10)
        if not connection:
            return
        try:
            with connection.cursor() as cursor:
                cursor.execute("SHOW DATABASES LIKE %s", (db_name,))
                if not cursor.fetchone():
                    logger.info(f'{db_name}: 数据表不存在!')
                    return
        except Exception as e:
            logger.error(f'002 {e}')
            return
        finally:
            connection.close()

        self.config.update({'database': db_name})
        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
        if not connection:
            return
        try:
            with connection.cursor() as cursor:
                cursor.execute("SHOW TABLES")
                return cursor.fetchall()
        finally:
            connection.close()

    def table_datas(self, db_name, table_name, date):
        """
        Fetch the rows of `table_name` whose `日期` equals `date`.

        :return: list of row dicts, or None when the connection or the query failed
        """
        self.config.update({'database': db_name})
        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
        if not connection:
            return
        try:
            with connection.cursor() as cursor:
                sql = f"SELECT * FROM `{table_name}` WHERE `日期` BETWEEN %s AND %s"
                # The date is bound as a parameter (its str() form, as before) instead of
                # being pasted into the statement.
                cursor.execute(sql, (str(date), str(date)))
                return cursor.fetchall()
        except Exception as e:
            logger.error(f'001 {e}')
            return None
        finally:
            connection.close()

    def day_list(self, start_date, end_date):
        """Return one midnight timestamp per day, stepping from `start_date` while it is <= `end_date`."""
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        date_list = []
        while start_date <= end_date:
            date_list.append(pd.to_datetime(start_date.date()))
            start_date += datetime.timedelta(days=1)
        return date_list

    def rename_column(self):
        """
        Strip trailing underscores from every column name in self.db_name.

        Usage sketch:
            s = OptimizeDatas(username=username, password=password, host=host, port=port)
            s.db_name = db_name
            s.rename_column()
        """
        tables = self.table_list(db_name=self.db_name)
        if not tables:
            return
        for table_dict in tables:
            for table_name in table_dict.values():
                self.config.update({'database': self.db_name})
                self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
                if not self.connection:
                    return
                try:
                    with self.connection.cursor() as cursor:
                        cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")
                        for column in cursor.fetchall():
                            old_name, column_type = column['Field'], column['Type']
                            if not old_name.endswith('_'):
                                continue
                            new_name = re.sub(r'_+$', '', old_name)
                            if not new_name:
                                # A name made only of underscores has no valid replacement.
                                continue
                            # Identifiers are back-quoted so non-ASCII or reserved names stay valid.
                            sql = (f"ALTER TABLE `{table_name}` "
                                   f"CHANGE COLUMN `{old_name}` `{new_name}` {column_type}")
                            cursor.execute(sql)
                    self.connection.commit()
                finally:
                    self.connection.close()
1113
+
1114
+
1115
class StatementCache(OrderedDict):
    """
    Size-bounded mapping with least-recently-used eviction.

    Reading a key through ``cache[key]`` or writing it marks it as most recently used;
    once more than ``maxsize`` entries are stored, the least recently used one is dropped.
    """

    def __init__(self, maxsize=100):
        """
        :param maxsize: maximum number of entries kept in the cache
        """
        super().__init__()
        self.maxsize = maxsize

    def __getitem__(self, key):
        value = super().__getitem__(key)
        # A successful lookup counts as a use.
        self.move_to_end(key)
        return value

    def __setitem__(self, key, value):
        if key in self:
            # Overwriting an entry refreshes its recency as well.
            self.move_to_end(key)
        super().__setitem__(key, value)
        if len(self) > self.maxsize:
            # Evict via plain deletion: popitem() on an OrderedDict subclass may route
            # through the overridden __getitem__ on some interpreter versions.
            oldest = next(iter(self))
            del self[oldest]
1125
+
1126
+
1127
+ class MySQLUploader:
1128
def __init__(
        self,
        username: str,
        password: str,
        host: str = 'localhost',
        port: int = 3306,
        charset: str = 'utf8mb4',
        collation: str = 'utf8mb4_0900_ai_ci',  # *_ai_ci is case-insensitive; utf8mb4_0900_as_cs / utf8mb4_bin are case-sensitive
        logging_mode: str = 'console',  # 'both' (console + file), 'console', 'file' or 'none'
        log_level: str = 'INFO',
        log_file: str = 'mysql_upload.log',
        max_log_size: int = 50,  # MB
        backup_count: int = 5,
        max_retries: int = 10,
        retry_interval: int = 10,
        pool_size: int = 5,
        connect_timeout: int = 10,
        read_timeout: int = 30,
        write_timeout: int = 30,
        ssl: Optional[Dict] = None,
        enable_metrics: bool = True
):
    """
    Configure the uploader, set up logging and create the connection pool.

    :param username: database user name
    :param password: database password
    :param host: database host, defaults to localhost
    :param port: database port, defaults to 3306
    :param charset: character set, defaults to utf8mb4
    :param collation: collation, defaults to utf8mb4_0900_ai_ci
    :param logging_mode: 'both' (console + file), 'console', 'file' or 'none' (disabled)
    :param log_level: log level, defaults to INFO
    :param log_file: path of the log file
    :param max_log_size: maximum log file size in MB, defaults to 50
    :param backup_count: number of rotated log files kept, defaults to 5
    :param max_retries: maximum number of attempts (at least 1), defaults to 10
    :param retry_interval: retry interval in seconds (at least 1), defaults to 10
    :param pool_size: connection pool size (at least 1), defaults to 5
    :param connect_timeout: connect timeout in seconds, defaults to 10
    :param read_timeout: read timeout in seconds, defaults to 30
    :param write_timeout: write timeout in seconds, defaults to 30
    :param ssl: SSL configuration dict, defaults to None
    :param enable_metrics: whether performance metrics are collected, defaults to True
    """
    # Connection identity.
    self.username, self.password = username, password
    self.host, self.port = host, port
    self.charset, self.collation = charset, collation

    # Retry / pool sizing, each clamped to a minimum of 1.
    self.max_retries = max(max_retries, 1)
    self.retry_interval = max(retry_interval, 1)
    self.pool_size = max(pool_size, 1)

    # Socket timeouts and TLS settings.
    self.connect_timeout = connect_timeout
    self.read_timeout = read_timeout
    self.write_timeout = write_timeout
    self.ssl = ssl

    # Statement cache.
    self._max_cached_statements = 100
    self._prepared_statements = StatementCache(maxsize=self._max_cached_statements)

    # Metrics: plain counters first, then the accumulated time, then sample series.
    self.enable_metrics = enable_metrics
    counter_names = (
        'total_uploads', 'successful_uploads', 'failed_uploads',
        'total_rows', 'successful_rows', 'failed_rows',
        'total_retries',
    )
    self.metrics = dict.fromkeys(counter_names, 0)
    self.metrics['total_execution_time'] = 0.0
    for series_name in ('connection_usage', 'memory_usage', 'cpu_usage'):
        self.metrics[series_name] = []
    self._last_metrics_time = 0
    self._metrics_cache = {}  # most recent system metrics sample
    self.metrics_interval = 30  # seconds between system metrics samples
    self._table_metadata_cache = {}  # table metadata cache
    self.metadata_cache_ttl = 300  # seconds a metadata entry stays valid

    # Logging first, so that pool creation below can already log.
    self._init_logging(logging_mode, log_level, log_file, max_log_size, backup_count)

    self.pool = self._create_connection_pool()
1211
+
1212
+ def _init_logging(
1213
+ self,
1214
+ logging_mode: str,
1215
+ log_level: str,
1216
+ log_file: str,
1217
+ max_log_size: int,
1218
+ backup_count: int
1219
+ ):
1220
+ """初始化结构化日志配置"""
1221
+ if logging_mode.lower() == 'none':
1222
+ self.logger = None
1223
+ return
1224
+
1225
+ valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
1226
+ level = log_level.upper() if log_level.upper() in valid_levels else 'INFO'
1227
+
1228
+ # 创建格式化器 - 使用结构化JSON格式
1229
+ class StructuredFormatter(logging.Formatter):
1230
+ def format(self, record):
1231
+ log_data = {
1232
+ 'time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
1233
+ 'level': record.levelname,
1234
+ 'message': record.getMessage(),
1235
+ # 'logger': record.name,
1236
+ 'module': record.module,
1237
+ 'line': record.lineno,
1238
+ # 'process': record.process
1239
+ }
1240
+
1241
+ # 添加异常信息
1242
+ if record.exc_info:
1243
+ log_data['exception'] = self.formatException(record.exc_info)
1244
+
1245
+ # 过滤敏感信息
1246
+ if hasattr(record, 'password'):
1247
+ log_data['message'] = log_data['message'].replace(self.password, '***')
1248
+
1249
+ return json.dumps(log_data, ensure_ascii=False)
1250
+
1251
+ # 创建日志记录器
1252
+ self.logger = logging.getLogger('upload')
1253
+ self.logger.setLevel(level)
1254
+
1255
+ # 防止重复添加handler
1256
+ if self.logger.handlers:
1257
+ for handler in self.logger.handlers[:]:
1258
+ self.logger.removeHandler(handler)
1259
+
1260
+ formatter = StructuredFormatter()
1261
+ mode = logging_mode.lower()
1262
+
1263
+ # 根据模式添加相应的handler
1264
+ if mode in ('both', 'console'):
1265
+ console_handler = logging.StreamHandler()
1266
+ console_handler.setFormatter(formatter)
1267
+ self.logger.addHandler(console_handler)
1268
+
1269
+ if mode in ('both', 'file'):
1270
+ file_handler = logging.handlers.RotatingFileHandler(
1271
+ filename=log_file,
1272
+ maxBytes=max_log_size * 1024 * 1024,
1273
+ backupCount=backup_count,
1274
+ encoding='utf-8'
1275
+ )
1276
+ file_handler.setFormatter(formatter)
1277
+ self.logger.addHandler(file_handler)
1278
+
1279
+ def _record_metrics(self, metric_name: str, value: Any = 1, is_timing: bool = False):
1280
+ """记录性能指标"""
1281
+ if not self.enable_metrics:
1282
+ return
1283
+
1284
+ # 对于频繁调用的指标,使用更高效的数据结构
1285
+ if metric_name in ('total_uploads', 'successful_uploads', 'failed_uploads'):
1286
+ self.metrics[metric_name] = self.metrics.get(metric_name, 0) + value
1287
+ return
1288
+
1289
+ if metric_name not in self.metrics:
1290
+ self.metrics[metric_name] = []
1291
+
1292
+ if is_timing:
1293
+ # 如果是时间指标,记录时间戳和值
1294
+ self.metrics[metric_name].append({
1295
+ 'timestamp': datetime.datetime.now().isoformat(),
1296
+ 'value': value
1297
+ })
1298
+ else:
1299
+ # 其他指标直接累加
1300
+ if isinstance(self.metrics[metric_name], (int, float)):
1301
+ self.metrics[metric_name] += value
1302
+ elif isinstance(self.metrics[metric_name], list):
1303
+ self.metrics[metric_name].append({
1304
+ 'timestamp': datetime.datetime.now().isoformat(),
1305
+ 'value': value
1306
+ })
1307
+
1308
+ def _get_system_metrics(self):
1309
+ """获取系统资源使用指标"""
1310
+ if not self.enable_metrics:
1311
+ return {}
1312
+
1313
+ metrics = {
1314
+ 'memory': psutil.virtual_memory().percent,
1315
+ 'cpu': psutil.cpu_percent(),
1316
+ }
1317
+
1318
+ # 更安全的连接数获取方式
1319
+ if hasattr(self, 'pool') and self.pool is not None:
1320
+ try:
1321
+ # 对于不同的连接池实现可能有不同的属性名
1322
+ if hasattr(self.pool, '_connections'):
1323
+ connections = self.pool._connections
1324
+ metrics['connections'] = len(connections) if hasattr(connections, '__len__') else 0
1325
+ else:
1326
+ metrics['connections'] = 0
1327
+ except Exception:
1328
+ metrics['connections'] = 0
1329
+ else:
1330
+ metrics['connections'] = 0
1331
+
1332
+ return metrics
1333
+
1334
+ def _log_with_metrics(self, level: str, message: str, extra: Optional[Dict] = None):
1335
+ """日志记录"""
1336
+ if not self.logger:
1337
+ return
1338
+
1339
+ if len(message) > 500:
1340
+ message = message[:500] + '...'
1341
+
1342
+ now = time.time()
1343
+ if now - self._last_metrics_time > self.metrics_interval:
1344
+ self._metrics_cache = self._get_system_metrics()
1345
+ # 使用缓存的指标
1346
+ log_extra = {'metrics': self._metrics_cache}
1347
+ self._last_metrics_time = now
1348
+ else:
1349
+ # 记录系统指标
1350
+ metrics = self._get_system_metrics()
1351
+ log_extra = {'metrics': metrics}
1352
+
1353
+ if extra:
1354
+ log_extra.update(extra)
1355
+
1356
+ getattr(self.logger, level.lower())(message, extra={'extra_data': log_extra})
1357
+
1358
def _create_connection_pool(self) -> PooledDB:
    """
    Return a usable connection pool, building a new one when necessary.

    An existing pool is returned unchanged when self._check_pool_health() accepts it.

    :raises ValueError: when self.ssl is set but lacks one of 'ca', 'cert', 'key'
    :raises ConnectionError: when the pool cannot be created
    """
    existing = getattr(self, 'pool', None)
    if existing is not None and self._check_pool_health():
        return self.pool

    started_at = time.time()
    self.pool = None

    settings = dict(
        creator=pymysql,
        host=self.host,
        port=self.port,
        user=self.username,
        password=self.password,
        charset=self.charset,
        cursorclass=pymysql.cursors.DictCursor,
        maxconnections=self.pool_size,
        ping=7,
        connect_timeout=self.connect_timeout,
        read_timeout=self.read_timeout,
        write_timeout=self.write_timeout,
        autocommit=False,
    )

    if self.ssl:
        missing = [name for name in ('ca', 'cert', 'key') if name not in self.ssl]
        if missing:
            error_msg = "SSL配置必须包含ca、cert和key"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)
        settings['ssl'] = {
            'ca': self.ssl['ca'],
            'cert': self.ssl['cert'],
            'key': self.ssl['key'],
            'check_hostname': self.ssl.get('check_hostname', False)
        }

    try:
        new_pool = PooledDB(**settings)
        took = time.time() - started_at
        self._record_metrics('connection_pool_creation_time', took, is_timing=True)
        self._log_with_metrics('info', "连接池创建成功", {
            'pool_size': self.pool_size,
            'time_elapsed': took
        })
        return new_pool
    except Exception as exc:
        took = time.time() - started_at
        self._record_metrics('connection_pool_failures', 1)
        self.pool = None
        self._log_with_metrics('error', "连接池创建失败", {
            'error': str(exc),
            'time_elapsed': took
        })
        raise ConnectionError(f"连接池创建失败: {str(exc)}")
1413
+
1414
+ def _execute_with_retry(self, func):
1415
+ @wraps(func)
1416
+ def wrapper(*args, **kwargs):
1417
+ last_exception = None
1418
+ start_time = time.time()
1419
+ operation = func.__name__
1420
+
1421
+ self._log_with_metrics('debug', f"开始执行操作: {operation}", {
1422
+ 'attempt': 1,
1423
+ 'max_retries': self.max_retries
1424
+ })
1425
+
1426
+ for attempt in range(self.max_retries):
1427
+ try:
1428
+ result = func(*args, **kwargs)
1429
+ elapsed = time.time() - start_time
1430
+
1431
+ if attempt > 0:
1432
+ self._record_metrics('total_retries', attempt)
1433
+ self._log_with_metrics('info', "操作成功(重试后)", {
1434
+ 'operation': operation,
1435
+ 'attempts': attempt + 1,
1436
+ 'time_elapsed': elapsed
1437
+ })
1251
1438
  else:
1252
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
1253
- cursor.execute(sql)
1254
- logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
1439
+ self._log_with_metrics('debug', "操作成功", {
1440
+ 'operation': operation,
1441
+ 'time_elapsed': elapsed
1442
+ })
1255
1443
 
1256
- if col == '日期':
1257
- sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
1258
- logger.info(f"设置为索引: {col}({dtypes[col]})")
1259
- cursor.execute(sql)
1260
- connection.commit() # 提交事务
1261
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
1262
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
1263
- # 处理插入的数据
1264
- if icm_update:
1265
- """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
1266
- sql = """SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s"""
1267
- cursor.execute(sql, (db_name, table_name))
1268
- cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()] # 数据表的所有列, 返回 list
1444
+ return result
1269
1445
 
1270
- # 保留原始列名,不提前转义
1271
- raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id']
1446
+ except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
1447
+ last_exception = e
1448
+ self._record_metrics('database_errors', 1)
1272
1449
 
1273
- # 构建条件参数(使用原始列名)
1274
- condition_params = []
1275
- condition_parts = []
1276
- for up_col in icm_update:
1277
- condition_parts.append(f"`{up_col}` = %s") # SQL 转义
1278
- condition_params.append(dict_data[up_col]) # 原始列名访问数据
1450
+ # 记录详细的MySQL错误信息
1451
+ error_details = {
1452
+ 'operation': operation,
1453
+ 'error_code': e.args[0] if e.args else None,
1454
+ 'error_message': e.args[1] if len(e.args) > 1 else None,
1455
+ 'attempt': attempt + 1,
1456
+ 'max_retries': self.max_retries
1457
+ }
1279
1458
 
1280
- # 动态转义列名生成 SQL 查询字段
1281
- escaped_update_col = [f'`{col}`' for col in raw_update_col]
1282
- sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
1283
- cursor.execute(sql, condition_params)
1284
- results = cursor.fetchall()
1459
+ if attempt < self.max_retries - 1:
1460
+ wait_time = self.retry_interval * (attempt + 1)
1461
+ error_details['wait_time'] = wait_time
1462
+ self._log_with_metrics('warning', f"数据库操作失败,准备重试 {error_details}", )
1463
+ time.sleep(wait_time)
1285
1464
 
1286
- if results:
1287
- for result in results:
1288
- change_col = []
1289
- change_placeholders = []
1290
- set_params = []
1291
- for raw_col in raw_update_col:
1292
- # 使用原始列名访问数据
1293
- df_value = str(dict_data[raw_col])
1294
- mysql_value = str(result[raw_col])
1465
+ # 尝试重新连接
1466
+ try:
1467
+ self.pool = self._create_connection_pool()
1468
+ self._log_with_metrics('info', "成功重新建立数据库连接")
1469
+ except Exception as reconnect_error:
1470
+ self._log_with_metrics('error', "重连失败", {
1471
+ 'error': str(reconnect_error)
1472
+ })
1473
+ else:
1474
+ elapsed = time.time() - start_time
1475
+ error_details['time_elapsed'] = elapsed
1476
+ self._log_with_metrics('error', f"操作最终失败 {error_details}")
1295
1477
 
1296
- # 清理小数点后多余的零
1297
- if '.' in df_value:
1298
- df_value = re.sub(r'0+$', '', df_value).rstrip('.')
1299
- if '.' in mysql_value:
1300
- mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
1478
+ except pymysql.IntegrityError as e:
1479
+ elapsed = time.time() - start_time
1480
+ self._record_metrics('integrity_errors', 1)
1481
+ self._log_with_metrics('error', "完整性约束错误", {
1482
+ 'operation': operation,
1483
+ 'time_elapsed': elapsed,
1484
+ 'error_code': e.args[0] if e.args else None,
1485
+ 'error_message': e.args[1] if len(e.args) > 1 else None
1486
+ })
1487
+ raise e
1301
1488
 
1302
- if df_value != mysql_value:
1303
- change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
1304
- set_params.append(dict_data[raw_col])
1305
- change_col.append(raw_col)
1489
+ except Exception as e:
1490
+ last_exception = e
1491
+ elapsed = time.time() - start_time
1492
+ self._record_metrics('unexpected_errors', 1)
1493
+ self._log_with_metrics('error', "发生意外错误", {
1494
+ 'operation': operation,
1495
+ 'time_elapsed': elapsed,
1496
+ 'error_type': type(e).__name__,
1497
+ 'error_message': str(e),
1498
+ 'error_args': e.args if hasattr(e, 'args') else None
1499
+ })
1500
+ break
1306
1501
 
1307
- if change_placeholders:
1308
- full_params = set_params + condition_params
1309
- sql = f"""UPDATE `{table_name}`
1310
- SET {','.join(change_placeholders)}
1311
- WHERE {' AND '.join(condition_parts)}"""
1312
- cursor.execute(sql, full_params)
1313
- else: # 没有数据返回,则直接插入数据
1314
- # 参数化插入语句
1315
- keys = [f"`{k}`" for k in dict_data.keys()]
1316
- placeholders = ','.join(['%s'] * len(dict_data))
1317
- update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
1318
- sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
1319
- cursor.execute(sql, tuple(dict_data.values()))
1320
- connection.commit() # 提交数据库
1321
- connection.close()
1322
- return
1502
+ raise last_exception if last_exception else Exception("发生未知错误")
1323
1503
 
1324
- # 常规插入处理(参数化)
1325
- keys = [f"`{k}`" for k in dict_data.keys()]
1326
- placeholders = ','.join(['%s'] * len(dict_data))
1327
- update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
1328
- sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
1329
- cursor.execute(sql, tuple(dict_data.values()))
1330
- connection.commit()
1331
- connection.close()
1504
+ return wrapper
1332
1505
 
1333
- def cover_dict_dtypes(self, dict_data):
1334
- """ 清理字典键值 并转换数据类型 """
1335
- if not dict_data:
1336
- logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
1337
- return
1338
- __res_dict = {}
1339
- new_dict_data = {}
1340
- for k, v in dict_data.items():
1341
- k = str(k).lower()
1342
- k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
1343
- k = k.replace(')', '')
1344
- k = re.sub(r'_{2,}', '_', k)
1345
- k = re.sub(r'_+$', '', k)
1346
- if str(v) == '':
1347
- v = 0
1348
- v = str(v)
1349
- v = re.sub('^="|"$', '', v, re.I)
1350
- v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(v)) # 移除控制字符
1351
- if re.findall(r'^[-+]?\d+\.?\d*%$', v):
1352
- v = str(float(v.rstrip("%")) / 100)
1506
+ def _get_connection(self):
1507
+ """从连接池获取连接"""
1508
+ try:
1509
+ conn = self.pool.connection()
1510
+ self._log_with_metrics('debug', "获取数据库连接")
1511
+ return conn
1512
+ except Exception as e:
1513
+ self._log_with_metrics("error", f'{e}')
1514
+ raise ConnectionError(f"连接数据库失败: {str(e)}")
1353
1515
 
1354
- result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
1355
- result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
1356
- result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
1357
- result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
1516
+ def _check_database_exists(self, db_name: str) -> bool:
1517
+ """检查数据库是否存在"""
1518
+ db_name = self._validate_identifier(db_name)
1519
+ sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
1358
1520
 
1359
- date_type = otk.is_valid_date(v) # 判断日期时间
1360
- int_num = otk.is_integer(v) # 判断整数
1361
- count_int, count_float = count_decimal_places(v) # 判断小数,返回小数位数
1362
- if result1: # 京东sku/spu商品信息
1363
- __res_dict.update({k: 'varchar(100)'})
1364
- elif k == '日期':
1365
- __res_dict.update({k: 'DATE'})
1366
- elif k == '更新时间':
1367
- __res_dict.update({k: 'TIMESTAMP'})
1368
- elif result2: # 小数
1369
- __res_dict.update({k: 'decimal(10,4)'})
1370
- elif date_type == 1: # 纯日期
1371
- __res_dict.update({k: 'DATE'})
1372
- elif date_type == 2: # 日期+时间
1373
- __res_dict.update({k: 'DATETIME'})
1374
- elif int_num:
1375
- __res_dict.update({k: 'INT'})
1376
- elif count_float > 0:
1377
- if count_int + count_float > 10:
1378
- # if count_float > 5:
1379
- # v = round(float(v), 4)
1380
- if count_float >= 6:
1381
- __res_dict.update({k: 'decimal(14,6)'})
1382
- else:
1383
- __res_dict.update({k: 'decimal(14,4)'})
1384
- elif count_float >= 6:
1385
- __res_dict.update({k: 'decimal(14,6)'})
1386
- elif count_float >= 4:
1387
- __res_dict.update({k: 'decimal(12,4)'})
1388
- else:
1389
- __res_dict.update({k: 'decimal(10,2)'})
1390
- else:
1391
- __res_dict.update({k: 'varchar(255)'})
1392
- new_dict_data.update({k: v})
1393
- return __res_dict, new_dict_data
1521
+ try:
1522
+ with self._get_connection() as conn:
1523
+ with conn.cursor() as cursor:
1524
+ cursor.execute(sql, (db_name,))
1525
+ exists = bool(cursor.fetchone())
1526
+ self._log_with_metrics('debug', f"{db_name} 数据库已存在: {exists}")
1527
+ return exists
1528
+ except Exception as e:
1529
+ self._log_with_metrics('error', f"检查数据库是否存在时出错: {str(e)}")
1530
+ raise
1394
1531
 
1395
- def convert_df_dtypes(self, df: pd.DataFrame):
1396
- """ 清理 df 的值和列名,并转换数据类型 """
1397
- df = otk.cover_df(df=df) # 清理 df 的值和列名
1398
- [pd.to_numeric(df[col], errors='ignore') for col in df.columns.tolist()]
1399
- dtypes = df.dtypes.to_dict()
1400
- __res_dict = {}
1401
- for k, v in dtypes.copy().items():
1402
- result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
1403
- result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
1404
- result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
1405
- result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
1532
def _create_database(self, db_name: str):
    """Create the database ``db_name`` if it does not exist yet.

    The statement uses ``self.charset`` and ``self.collation`` for the new
    schema's character set and collation.

    :param db_name: database name; sanitised with ``_validate_identifier``
    :raises ValueError: propagated from ``_validate_identifier``
    :raises Exception: any connection/execution error is logged and re-raised
    """
    # _validate_identifier may hand back a reserved word already wrapped in
    # backticks; strip them so the explicit quoting below is applied once.
    db_name = self._validate_identifier(db_name).strip('`')
    sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"

    conn = None  # stays None when acquiring the connection itself fails
    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql)
            conn.commit()
            self._log_with_metrics('info', f"{db_name} 数据库已创建")
    except Exception as e:
        self._log_with_metrics('error', f"{db_name}: 无法创建数据库 {str(e)}")
        if conn is not None:
            # Best-effort rollback: a failure here must never replace the
            # original exception that is about to be re-raised.
            try:
                conn.rollback()
            except Exception as rollback_error:
                self._log_with_metrics('error', f"{db_name}: rollback failed: {rollback_error}")
        raise
1430
1547
 
1431
- @try_except
1432
- def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False,
1433
- filename=None, count=None, allow_not_null=False, cut_data=None):
1548
def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
    """Build the name of the partition (sub-)table for a given date.

    :param table_name: base table name
    :param date_value: date used to pick the partition; a date string that
        ``_validate_datetime`` can parse, or a ``datetime.date`` /
        ``datetime.datetime`` / ``pandas.Timestamp`` object
    :param partition_by: ``'year'`` -> ``<table>_<YYYY>``,
        ``'month'`` -> ``<table>_<YYYY>_<MM>``
    :return: partition table name
    :raises ValueError: if the date cannot be parsed or ``partition_by`` is
        neither ``'year'`` nor ``'month'``
    """
    if isinstance(date_value, datetime.date) and not pd.isna(date_value):
        # Already a date-like object (datetime and Timestamp subclass date).
        date_obj = date_value
    else:
        try:
            date_obj = self._validate_datetime(date_value, True)
        except (ValueError, TypeError) as exc:
            # TypeError covers non-string input reaching strptime.
            error_msg = f"无效的日期格式1: {date_value}"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg) from exc

    if partition_by == 'year':
        return f"{table_name}_{date_obj.year}"
    elif partition_by == 'month':
        return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
    else:
        error_msg = "partition_by must be 'year' or 'month'"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)
1491
1578
 
1492
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1493
- if not connection:
1494
- return
1495
- with connection.cursor() as cursor:
1496
- cursor.execute("SHOW DATABASES LIKE %s", (db_name,)) # 检查数据库是否存在
1497
- database_exists = cursor.fetchone()
1498
- if not database_exists:
1499
- # 如果数据库不存在,则新建
1500
- sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
1501
- cursor.execute(sql)
1502
- connection.commit()
1503
- logger.info(f"创建Database: {db_name}")
1579
def _validate_identifier(self, identifier: str) -> str:
    """Validate and sanitise a database identifier (schema, table or column name).

    Every character that is not a word character (``\\w``, which for ``str``
    patterns includes CJK), a character in U+4E00-U+9FFF, or ``$`` is removed.

    The returned name is never quoted. Callers embed it between backticks or
    bind it as a query parameter, so wrapping reserved words in backticks here
    would produce doubled quoting (``\\`\\`select\\`\\```) or a parameter value
    that can never match. Reserved words are only reported in the debug log.

    :param identifier: identifier to validate
    :return: sanitised, unquoted identifier
    :raises ValueError: if ``identifier`` is empty, not a string, or nothing
        remains after the illegal characters are removed
    """
    if not identifier or not isinstance(identifier, str):
        error_msg = f"无效的标识符: {identifier}"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    # Drop everything that is not a word character, CJK ideograph or '$'.
    cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
    if not cleaned:
        error_msg = f"无法清理异常标识符: {identifier}"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    # A small subset of MySQL reserved words; legal as identifiers when quoted.
    mysql_keywords = {
        'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
        'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
    }
    if cleaned.lower() in mysql_keywords:
        self._log_with_metrics('debug', f"存在MySQL保留字: {cleaned}")

    return cleaned
1543
1610
 
1544
- if df_sql:
1545
- logger.info(f'正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
1546
- engine = create_engine(
1547
- f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
1548
- df.to_sql(
1549
- name=table_name,
1550
- con=engine,
1551
- if_exists='append',
1552
- index=False,
1553
- chunksize=1000,
1554
- method='multi'
1555
- )
1556
- connection.commit() # 提交事务
1557
- connection.close()
1558
- return
1611
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
    """Report whether ``db_name.table_name`` exists, with a short-lived cache.

    Positive answers are kept in ``self._table_metadata_cache`` for
    ``self.metadata_cache_ttl`` seconds. Negative answers are deliberately
    not cached: a missing table is typically created right after this check,
    and a cached "missing" would make the next call attempt to create it
    again.

    :param db_name: database (schema) name
    :param table_name: table name
    :return: True if the table exists, False otherwise
    :raises Exception: any connection/query error is logged and re-raised
    """
    cache_key = f"{db_name}.{table_name}"
    if cache_key in self._table_metadata_cache:
        cached_time, result = self._table_metadata_cache[cache_key]
        if time.time() - cached_time < self.metadata_cache_ttl:
            return result

    # The names are bound as query parameters, so they must be unquoted;
    # strip the backticks _validate_identifier may add for reserved words.
    db_name = self._validate_identifier(db_name).strip('`')
    table_name = self._validate_identifier(table_name).strip('`')
    sql = """
        SELECT TABLE_NAME
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
    """

    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (db_name, table_name))
                result = bool(cursor.fetchone())
    except Exception as e:
        self._log_with_metrics('error', f"检查数据表是否存在时发生未知错误: {e}", )
        raise

    if result:
        self._table_metadata_cache[cache_key] = (time.time(), result)
    else:
        # Also drop any expired entry so a stale "exists" cannot linger.
        self._table_metadata_cache.pop(cache_key, None)
    return result
1587
1639
 
1588
- datas = df.to_dict(orient='records')
1589
- for data in datas:
1590
- # data 是传进来待处理的数据, 不是数据库数据
1591
- # data 示例: {'日期': Timestamp('2024-08-27 00:00:00'), '推广费余额': 33299, '品销宝余额': 2930.73, '短信剩余': 67471}
1592
- try:
1593
- # 预处理数据:转换非字符串类型
1594
- processed_data = {}
1595
- for k, v in data.items():
1596
- if isinstance(v, (int, float)):
1597
- processed_data[k] = float(v)
1598
- elif isinstance(v, pd.Timestamp):
1599
- processed_data[k] = v.strftime('%Y-%m-%d')
1600
- else:
1601
- processed_data[k] = str(v)
1640
def _create_table(
        self,
        db_name: str,
        table_name: str,
        set_typ: Dict[str, str],
        primary_keys: Optional[List[str]] = None,
        date_column: Optional[str] = None,
        indexes: Optional[List[str]] = None,
        allow_null: bool = False
):
    """Create a table with an auto-increment ``id`` column plus the given columns.

    :param db_name: database name
    :param table_name: table name
    :param set_typ: mapping ``{column name: column type}``; a column called
        ``id`` (any case) is skipped because ``id`` is always generated
    :param primary_keys: primary key columns; ``id`` is prepended when absent,
        and ``['id']`` is used when nothing is given
    :param date_column: date column name; indexed when present in ``set_typ``
    :param indexes: further columns to index; names missing from ``set_typ``
        are ignored and each column is indexed at most once
    :param allow_null: when False every non-JSON column gets ``NOT NULL``
    :raises ValueError: if ``set_typ`` is empty or an identifier is invalid
    :raises Exception: any connection/execution error is logged and re-raised
    """
    # Strip backticks _validate_identifier may add for reserved words; all
    # identifiers are quoted explicitly in the statements below.
    db_name = self._validate_identifier(db_name).strip('`')
    table_name = self._validate_identifier(table_name).strip('`')

    if not set_typ:
        error_msg = "No columns specified for table creation"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    # Column definitions, starting with the generated id column.
    column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
    for col_name, col_type in set_typ.items():
        if col_name.lower() == 'id':
            continue
        safe_col_name = self._validate_identifier(col_name).strip('`')
        col_def = f"`{safe_col_name}` {col_type}"
        if not allow_null and not col_type.lower().startswith('json'):
            col_def += " NOT NULL"
        column_defs.append(col_def)

    # Primary key: id is always part of it.
    if primary_keys:
        if 'id' not in [pk.lower() for pk in primary_keys]:
            primary_keys = ['id'] + primary_keys
    else:
        primary_keys = ['id']
    safe_primary_keys = [self._validate_identifier(pk).strip('`') for pk in primary_keys]
    primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"

    sql = (
        f"CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (\n"
        f"    {','.join(column_defs)}\n"
        f"    {primary_key_sql}\n"
        f") ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}"
    )

    # Index columns: date column first, then the explicit list. Deduplicated,
    # because two ADD INDEX statements with the same idx_<col> name would fail.
    index_columns = []
    for candidate in [date_column] + list(indexes or []):
        if candidate and candidate in set_typ:
            safe_idx_col = self._validate_identifier(candidate).strip('`')
            if safe_idx_col not in index_columns:
                index_columns.append(safe_idx_col)
    index_statements = [
        f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{col}` (`{col}`)"
        for col in index_columns
    ]

    conn = None  # stays None when acquiring the connection itself fails
    try:
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql)
            self._log_with_metrics('info', f"{db_name}.{table_name}: 数据表已创建")

            if index_statements:
                with conn.cursor() as cursor:
                    for stmt in index_statements:
                        cursor.execute(stmt)
                        self._log_with_metrics('debug', f"Executed index statement: {stmt}", )

            conn.commit()
            if index_statements:
                self._log_with_metrics('info', f"{db_name}.{table_name}: 索引已添加")

    except Exception as e:
        self._log_with_metrics('error', f"{db_name}.{table_name}: 建表失败: {str(e)}")
        if conn is not None:
            # Best-effort rollback: never let it replace the original error.
            try:
                conn.rollback()
            except Exception as rollback_error:
                self._log_with_metrics('error', f"{db_name}.{table_name}: rollback failed: {rollback_error}")
        raise
1745
+
1746
def _validate_datetime(self, value, date_type=False):
    """Parse a date/datetime value and return it in normalised form.

    Accepted string layouts are listed in ``formats`` below. ``strptime``'s
    ``%m`` and ``%d`` already accept single-digit values, so inputs such as
    ``2023-1-8`` or ``2023/1/8`` are covered without the platform-specific
    ``%-m`` / ``%-d`` codes (which ``strptime`` rejects).

    :param value: date string, or a ``datetime.date`` / ``datetime.datetime``
        / ``pandas.Timestamp`` object
    :param date_type: when True return a ``pandas.Timestamp`` truncated to
        the day; when False return a ``'%Y-%m-%d %H:%M:%S'`` string
    :return: normalised timestamp or string, depending on ``date_type``
    :raises ValueError: if ``value`` matches none of the supported formats or
        is of an unsupported type
    """
    formats = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y/%m/%d',
        '%Y%m%d',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
    )

    parsed = None
    if isinstance(value, datetime.date) and not pd.isna(value):
        # datetime.datetime and pandas.Timestamp are date subclasses.
        parsed = value
    elif isinstance(value, str):
        for fmt in formats:
            try:
                parsed = datetime.datetime.strptime(value, fmt)
                break
            except ValueError:
                continue

    if parsed is None:
        raise ValueError(f"无效的日期格式2: {value}")

    if date_type:
        return pd.to_datetime(parsed.strftime('%Y-%m-%d'))
    return parsed.strftime('%Y-%m-%d %H:%M:%S')
1720
1769
 
1721
- def optimize_list(self):
1770
def _validate_value(self, value: Any, column_type: str) -> Any:
    """Validate a value and convert it according to the column type.

    Conversion is chosen by substring match on the lower-cased column type:

    * ``int``                          -> ``int`` (signed integer strings allowed)
    * ``float`` / ``double`` / ``decimal`` -> ``float``
    * ``date`` / ``time`` / ``日期``     -> normalised date(-time) string
    * ``char`` / ``text``              -> string with backslash and single
      quote escaped
    * ``json``                         -> ``json.dumps`` output
    * anything else                    -> returned unchanged

    :param value: value to validate; ``None`` is returned as ``None``
    :param column_type: column data type, e.g. ``'INT'``, ``'DECIMAL(10,2)'``
    :return: converted value
    :raises ValueError: if the conversion fails
    """
    if value is None:
        return None

    try:
        column_type_lower = column_type.lower()

        if 'int' in column_type_lower:
            if isinstance(value, (str, bytes)):
                text = value.decode() if isinstance(value, bytes) else value
                text = text.strip()
                # str.isdigit() would reject a leading sign such as "-5".
                if not re.fullmatch(r'[+-]?\d+', text):
                    raise ValueError("非数字字符串无法转换为整数")
                return int(text)
            return int(value)
        elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
            return float(value)
        elif 'date' in column_type_lower or '日期' in column_type_lower or 'time' in column_type_lower:
            # A plain DATE column has no time part; DATETIME / TIMESTAMP keep it.
            date_only = column_type_lower.startswith('date') and 'time' not in column_type_lower
            if isinstance(value, (datetime.datetime, pd.Timestamp)):
                return value.strftime('%Y-%m-%d' if date_only else '%Y-%m-%d %H:%M:%S')
            elif isinstance(value, str):
                try:
                    normalized = self._validate_datetime(value)
                except ValueError as e:
                    raise ValueError(f"无效日期格式: {value} - {str(e)}")
                # _validate_datetime returns 'YYYY-MM-DD HH:MM:SS'.
                return normalized[:10] if date_only else normalized
            return str(value)
        elif 'char' in column_type_lower or 'text' in column_type_lower:
            # NOTE(review): manual escaping is only appropriate if the value is
            # later interpolated into SQL text; if the insert path binds
            # parameters, this stores the extra backslashes. Confirm against
            # the insert code, which is not visible here.
            if isinstance(value, str):
                return value.replace('\\', '\\\\').replace("'", "\\'")
            return str(value)
        elif 'json' in column_type_lower:
            return json.dumps(value)
        else:
            return value
    except (ValueError, TypeError) as e:
        error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg) from e
1756
1814
 
1757
- logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
1758
- for table_dict in tables:
1759
- for key, table_name in table_dict.items():
1760
- self.config.update({'database': self.db_name}) # 添加更新 config 字段
1761
- self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1762
- if not self.connection:
1763
- return
1764
- with self.connection.cursor() as cursor:
1765
- sql = f"SELECT 1 FROM `{table_name}` LIMIT 1"
1766
- cursor.execute(sql)
1767
- result = cursor.fetchone()
1768
- if not result:
1769
- logger.info(f'数据表: {table_name}, 数据长度为 0')
1770
- continue # 检查数据表是否为空
1815
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
    """Fetch the column names and data types of a table.

    Reads ``INFORMATION_SCHEMA.COLUMNS`` ordered by ordinal position. Rows
    are accessed by key (``COLUMN_NAME`` / ``DATA_TYPE``), so the cursor is
    expected to return mapping-style rows.

    :param db_name: database (schema) name
    :param table_name: table name
    :return: ``{column name: data type}`` in table column order
    :raises Exception: any connection/query error is logged and re-raised
    """
    schema = self._validate_identifier(db_name)
    table = self._validate_identifier(table_name)
    query = """
        SELECT COLUMN_NAME, DATA_TYPE
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
        ORDER BY ORDINAL_POSITION
    """

    try:
        with self._get_connection() as conn, conn.cursor() as cursor:
            cursor.execute(query, (schema, table))
            columns = {}
            for record in cursor.fetchall():
                columns[record['COLUMN_NAME']] = record['DATA_TYPE']
            self._log_with_metrics('debug', f"{schema}.{table}: 获取表的列信息: {columns}")
            return columns
    except Exception as exc:
        self._log_with_metrics('error', f"无法获取表列信息: {str(exc)}")
        raise
1801
1836
 
1802
- def delete_duplicate(self, table_name, date, except_key=['更新时间']):
1803
- datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
1804
- if not datas:
1805
- return
1806
- duplicate_id = [] # 出现重复的 id
1807
- all_datas = [] # 迭代器
1808
- for data in datas:
1809
- for e_key in except_key:
1810
- if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
1811
- del data[e_key]
1837
def _upload_to_table(
        self,
        db_name: str,
        table_name: str,
        data: List[Dict],
        set_typ: Dict[str, str],
        primary_keys: Optional[List[str]],
        check_duplicate: bool,
        duplicate_columns: Optional[List[str]],
        allow_null: bool,
        auto_create: bool,
        date_column: Optional[str],
        indexes: Optional[List[str]],
        batch_id: Optional[str] = None
):
    """Upload prepared rows into a single table, creating it first if allowed.

    Steps: ensure the table exists (creating it when ``auto_create`` is
    set), read its columns, check that every column in ``set_typ`` is present
    in the table, then hand the rows to ``_insert_data``.

    :param db_name: database name
    :param table_name: table name
    :param data: rows to insert
    :param set_typ: mapping ``{column name: column type}``
    :param primary_keys: primary key columns, used only when creating the table
    :param check_duplicate: forwarded to ``_insert_data``
    :param duplicate_columns: forwarded to ``_insert_data``
    :param allow_null: forwarded to ``_create_table``
    :param auto_create: create the table when it does not exist
    :param date_column: date column, used only when creating the table
    :param indexes: index columns, used only when creating the table
    :param batch_id: accepted but not used by this method
    :raises ValueError: if the table is missing and ``auto_create`` is False,
        if no columns could be read, or if a ``set_typ`` column is not in the
        table
    """
    # Make sure the target table is there before anything else.
    table_ready = self._check_table_exists(db_name, table_name)
    if not table_ready and not auto_create:
        error_msg = f"数据表不存在: '{db_name}.{table_name}'"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)
    if not table_ready:
        self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
                           allow_null=allow_null)

    # Read the actual table structure.
    table_columns = self._get_table_columns(db_name, table_name)
    if not table_columns:
        error_msg = f"获取列失败 '{db_name}.{table_name}'"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    # Every declared column must exist in the table; stop at the first miss.
    for col in set_typ:
        if col in table_columns:
            continue
        error_msg = f"列不存在: '{col}' -> '{db_name}.{table_name}'"
        self._log_with_metrics('error', error_msg)
        raise ValueError(error_msg)

    self._insert_data(
        db_name, table_name, data, set_typ,
        check_duplicate, duplicate_columns
    )
1882
+
1883
def _infer_data_type(self, value: Any) -> str:
    """Pick a MySQL column type that fits a sample value.

    Mapping, checked in this order: ``None`` -> ``VARCHAR(255)``; ``bool`` ->
    ``TINYINT(1)``; ``int`` -> ``INT`` when it fits in 32 bits signed, else
    ``BIGINT``; ``float`` -> ``DECIMAL(10,2)``; datetime / Timestamp ->
    ``DATETIME``; date -> ``DATE``; list / dict -> ``JSON``; ``str`` ->
    ``DATETIME`` when ``_validate_datetime`` accepts it, otherwise a text
    type chosen by length; anything else -> ``VARCHAR(255)``.

    :param value: sample value to inspect
    :return: MySQL data type string
    """
    fallback = 'VARCHAR(255)'  # default string type

    if value is None:
        return fallback
    # bool must be tested before int because bool is an int subclass.
    if isinstance(value, bool):
        return 'TINYINT(1)'
    if isinstance(value, int):
        fits_int32 = -2147483648 <= value <= 2147483647
        return 'INT' if fits_int32 else 'BIGINT'
    if isinstance(value, float):
        return 'DECIMAL(10,2)'
    # datetime must be tested before date because datetime is a date subclass.
    if isinstance(value, (datetime.datetime, pd.Timestamp)):
        return 'DATETIME'
    if isinstance(value, datetime.date):
        return 'DATE'
    if isinstance(value, (list, dict)):
        return 'JSON'
    if not isinstance(value, str):
        return fallback

    # Strings: first see whether the text parses as a date/time.
    try:
        self._validate_datetime(value)
    except ValueError:
        pass
    else:
        return 'DATETIME'

    # Otherwise choose the smallest text type that can hold the sample.
    size = len(value)
    for limit, sql_type in ((255, 'VARCHAR(255)'), (65535, 'TEXT'), (16777215, 'MEDIUMTEXT')):
        if size <= limit:
            return sql_type
    return 'LONGTEXT'
1934
+
1935
+ def _prepare_data(
1936
+ self,
1937
+ data: Union[Dict, List[Dict], pd.DataFrame],
1938
+ set_typ: Dict[str, str],
1939
+ allow_null: bool = False
1940
+ ) -> List[Dict]:
1941
+ """
1942
+ 准备要上传的数据,验证并转换数据类型
1943
+
1944
+ :param data: 输入数据
1945
+ :param set_typ: 列名和数据类型字典 {列名: 数据类型}
1946
+ :param allow_null: 是否允许空值
1947
+ :return: 待上传的数据列表和对应的数据类型
1948
+ :raises ValueError: 如果数据验证失败
1949
+ """
1950
+ # 统一数据格式为字典列表
1951
+ if isinstance(data, pd.DataFrame):
1952
+ try:
1953
+ # 将列名转为小写
1954
+ data.columns = [col.lower() for col in data.columns]
1955
+ data = data.replace({pd.NA: None}).to_dict('records')
1821
1956
  except Exception as e:
1822
- logger.debug(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> {e}')
1823
- del all_datas
1957
+ self._log_with_metrics("error", f"数据转字典时发生错误: {e}", )
1958
+ raise ValueError(f"数据转字典时发生错误: {e}")
1959
+ elif isinstance(data, dict):
1960
+ data = [{k.lower(): v for k, v in data.items()}]
1961
+ elif isinstance(data, list) and all(isinstance(item, dict) for item in data):
1962
+ # 将列表中的每个字典键转为小写
1963
+ data = [{k.lower(): v for k, v in item.items()} for item in data]
1964
+ else:
1965
+ error_msg = "数据结构必须是字典、列表、字典列表或dataframe"
1966
+ self._log_with_metrics('error', error_msg)
1967
+ raise ValueError(error_msg)
1824
1968
 
1825
- if not duplicate_id: # 如果没有重复数据,则跳过该数据表
1826
- return
1969
+ # 将set_typ的键转为小写
1970
+ set_typ = {k.lower(): v for k, v in set_typ.items()}
1827
1971
 
1828
- try:
1829
- with self.connection.cursor() as cursor:
1830
- placeholders = ', '.join(['%s'] * len(duplicate_id))
1831
- # 移除冗余数据
1832
- sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
1833
- cursor.execute(sql, duplicate_id)
1834
- logger.debug(f"{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
1835
- self.connection.commit() # 提交事务
1836
- except Exception as e:
1837
- logger.error(f'{self.db_name}/{table_name}, {e}')
1838
- self.connection.rollback() # 异常则回滚
1972
+ # 获取数据中实际存在的列名
1973
+ data_columns = set()
1974
+ if data:
1975
+ data_columns = set(data[0].keys())
1839
1976
 
1840
- def delete_duplicate2(self, table_name, except_key=['更新时间']):
1841
- with self.connection.cursor() as cursor:
1842
- sql = f"SELECT * FROM `{table_name}`" # 如果不包含日期列,则获取全部数据
1843
- cursor.execute(sql)
1844
- datas = cursor.fetchall()
1845
- if not datas:
1846
- return
1847
- duplicate_id = [] # 出现重复的 id
1848
- all_datas = [] # 迭代器
1849
- for data in datas:
1850
- for e_key in except_key:
1851
- if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
1852
- del data[e_key]
1853
- delete_id = data['id']
1854
- del data['id']
1855
- data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
1856
- if data in all_datas: # 数据出现重复时
1857
- duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
1858
- continue
1859
- all_datas.append(data) # 数据没有重复
1860
- del all_datas
1977
+ # 过滤set_typ,只保留数据中存在的列
1978
+ filtered_set_typ = {}
1979
+ for col in data_columns:
1980
+ if col in set_typ:
1981
+ filtered_set_typ[col] = set_typ[col]
1982
+ else:
1983
+ # 如果列不在set_typ中,尝试推断类型
1984
+ sample_values = [row[col] for row in data if col in row and row[col] is not None][:10]
1985
+ if sample_values:
1986
+ inferred_type = self._infer_data_type(sample_values[0])
1987
+ filtered_set_typ[col] = inferred_type
1988
+ self._log_with_metrics('debug', f"自动推断列'{col}'的数据类型为: {inferred_type}")
1989
+ else:
1990
+ # 没有样本值,使用默认类型
1991
+ filtered_set_typ[col] = 'VARCHAR(255)'
1992
+ self._log_with_metrics('debug', f"为列'{col}'使用默认数据类型: VARCHAR(255)")
1861
1993
 
1862
- if not duplicate_id: # 如果没有重复数据,则跳过该数据表
1863
- return
1994
+ prepared_data = []
1995
+ for row_idx, row in enumerate(data, 1):
1996
+ prepared_row = {}
1997
+ for col_name in filtered_set_typ:
1998
+ # 跳过id列,不允许外部传入id
1999
+ if col_name.lower() == 'id':
2000
+ continue
1864
2001
 
1865
- try:
1866
- with self.connection.cursor() as cursor:
1867
- placeholders = ', '.join(['%s'] * len(duplicate_id))
1868
- # 移除冗余数据
1869
- sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
1870
- cursor.execute(sql, duplicate_id)
1871
- logger.info(f"{table_name} -> before: {len(datas)}, "
1872
- f"remove: {cursor.rowcount}")
1873
- self.connection.commit() # 提交事务
1874
- except Exception as e:
1875
- logger.error(f'{self.db_name}/{table_name}, {e}')
1876
- self.connection.rollback() # 异常则回滚
2002
+ if col_name not in row:
2003
+ if not allow_null:
2004
+ error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
2005
+ self._log_with_metrics('error', error_msg)
2006
+ raise ValueError(error_msg)
2007
+ prepared_row[col_name] = None
2008
+ else:
2009
+ try:
2010
+ prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name])
2011
+ except ValueError as e:
2012
+ error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
2013
+ self._log_with_metrics('error', error_msg)
2014
+ raise ValueError(error_msg)
2015
+ prepared_data.append(prepared_row)
1877
2016
 
1878
- def database_list(self):
1879
- """ 获取所有数据库 """
1880
- connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1881
- if not connection:
1882
- return
1883
- with connection.cursor() as cursor:
1884
- cursor.execute("SHOW DATABASES")
1885
- databases = cursor.fetchall() # 获取所有数据库的结果
1886
- connection.close()
1887
- return databases
2017
+ self._log_with_metrics('debug', f"已准备 {len(prepared_data)} 行数据")
2018
+ return prepared_data, filtered_set_typ
2019
+
2020
def upload_data(
        self,
        db_name: str,
        table_name: str,
        data: Union[Dict, List[Dict], pd.DataFrame],
        set_typ: Dict[str, str],
        primary_keys: Optional[List[str]] = None,
        check_duplicate: bool = False,
        duplicate_columns: Optional[List[str]] = None,
        allow_null: bool = False,
        partition_by: Optional[str] = None,
        partition_date_column: str = '日期',
        auto_create: bool = True,
        indexes: Optional[List[str]] = None
):
    """
    Upload data to a database table, optionally split into per-period tables.

    Errors are logged rather than raised: any exception raised while
    validating, preparing or uploading is caught, reported through
    ``_log_with_metrics`` and counted in the ``failed_uploads`` metric.

    :param db_name: target database; created when missing and ``auto_create`` is true
    :param table_name: target table, or the base name of the partition tables
    :param data: a dict (one row), a list of dicts, or a DataFrame
    :param set_typ: mapping of column name to SQL type; must not be empty
    :param primary_keys: forwarded unchanged to ``_upload_to_table``
    :param check_duplicate: forwarded unchanged to ``_upload_to_table``
    :param duplicate_columns: forwarded unchanged to ``_upload_to_table``
    :param allow_null: forwarded to ``_prepare_data`` and ``_upload_to_table``
    :param partition_by: ``'year'``, ``'month'`` or ``None`` for a single table
    :param partition_date_column: column whose value selects the partition table
    :param auto_create: create the database when it does not exist; also forwarded
    :param indexes: forwarded unchanged to ``_upload_to_table``
    :return: None
    """
    upload_start = time.time()
    self._record_metrics('total_uploads', 1)

    # A single dict is one row; len() on it would count its keys instead.
    if isinstance(data, dict):
        initial_row_count = 1
    elif hasattr(data, '__len__'):
        initial_row_count = len(data)
    else:
        initial_row_count = 1
    self.metrics['total_rows'] += initial_row_count

    batch_id = f"batch_{int(time.time() * 1000)}"
    success_flag = False

    self._log_with_metrics('info', "开始上传数据", {
        'batch_id': batch_id,
        'database': db_name,
        'table': table_name,
        'partition_by': partition_by,
        'check_duplicate': check_duplicate,
        'row_count': initial_row_count,
        'auto_create': auto_create
    })

    try:
        # Validate arguments.
        if not set_typ:
            error_msg = "列的数据类型缺失"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)

        if partition_by and partition_by not in ['year', 'month']:
            error_msg = "分表方式必须是 'year' 或 'month'"
            self._log_with_metrics('error', error_msg)
            raise ValueError(error_msg)

        # Normalise the rows and narrow set_typ to the columns actually present.
        prepared_data, set_typ = self._prepare_data(data, set_typ, allow_null)

        # Make sure the database exists.
        if not self._check_database_exists(db_name):
            if auto_create:
                self._create_database(db_name)
            else:
                error_msg = f"数据库不存在: '{db_name}'"
                self._log_with_metrics('error', error_msg)
                raise ValueError(error_msg)

        if partition_by:
            # Prepared rows appear to carry lower-cased keys, so fall back to
            # the lower-cased column name when the exact name is not found.
            lowered_date_column = partition_date_column.lower()
            partitioned_data = {}
            for row in prepared_data:
                try:
                    if partition_date_column in row:
                        date_key = partition_date_column
                    else:
                        date_key = lowered_date_column
                    if date_key not in row:
                        error_msg = f"异常缺失列 '{partition_date_column}'"
                        self._log_with_metrics('error', error_msg)
                        continue  # skip this row

                    part_table = self._get_partition_table_name(
                        table_name,
                        str(row[date_key]),
                        partition_by
                    )
                    partitioned_data.setdefault(part_table, []).append(row)
                except Exception as e:
                    self._log_with_metrics('error', "分表处理失败", {
                        'row_data': row,
                        'error': str(e)
                    })
                    continue  # skip this row

            # Upload each partition on its own so one failure does not stop the rest.
            failed_partitions = 0
            for part_table, part_data in partitioned_data.items():
                try:
                    self._upload_to_table(
                        db_name, part_table, part_data, set_typ,
                        primary_keys, check_duplicate, duplicate_columns,
                        allow_null, auto_create, partition_date_column,
                        indexes, batch_id
                    )
                except Exception as e:
                    failed_partitions += 1
                    self._log_with_metrics('error', "分表上传失败", {
                        'partition_table': part_table,
                        'error': str(e)
                    })
                    continue  # keep going with the remaining partitions

            # Only report success when no partition upload failed.
            success_flag = failed_partitions == 0
        else:
            # No partitioning: upload straight into the target table.
            self._upload_to_table(
                db_name, table_name, prepared_data, set_typ,
                primary_keys, check_duplicate, duplicate_columns,
                allow_null, auto_create, partition_date_column,
                indexes, batch_id
            )
            success_flag = True

    except Exception as e:
        self._log_with_metrics('error', "上传过程中发生全局错误", {
            'error': str(e),
            'error_type': type(e).__name__
        })
    finally:
        elapsed = time.time() - upload_start
        self._record_metrics('upload_execution_time', elapsed, is_timing=True)

        if success_flag:
            self._record_metrics('successful_uploads', 1)
        else:
            self._record_metrics('failed_uploads', 1)

        self._log_with_metrics('info', "上传处理完成", {
            'batch_id': batch_id,
            'success': success_flag,
            'time_elapsed': elapsed,
            'initial_row_count': initial_row_count,
            'processed_rows': self.metrics['successful_rows'] + self.metrics['failed_rows']
        })
2152
+
2153
def _insert_data(
        self,
        db_name: str,
        table_name: str,
        data: List[Dict],
        set_typ: Dict[str, str],
        check_duplicate: bool = False,
        duplicate_columns: Optional[List[str]] = None,
        batch_size: int = 1000,
        batch_id: Optional[str] = None
):
    """
    Insert rows into a table one at a time, committing after every row.

    A failing row is rolled back, logged and skipped; it does not abort the
    remaining rows. An ``id`` column is never written.

    :param db_name: database name
    :param table_name: table name
    :param data: rows to insert, one dict per row
    :param set_typ: mapping of column name to SQL type
    :param check_duplicate: when true, a row is only inserted if no existing
        row matches it on ``duplicate_columns``
    :param duplicate_columns: columns compared for the duplicate check; all
        non-id columns are used when empty
    :param batch_size: number of rows per logging/metrics batch
    :param batch_id: batch identifier included in log records
    :return: None
    """
    if not data:
        return

    # Every column except id.
    all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
    safe_columns = [self._validate_identifier(col) for col in all_columns]
    placeholders = ','.join(['%s'] * len(safe_columns))
    column_list = '`,`'.join(safe_columns)

    if check_duplicate:
        if duplicate_columns:
            # Same case-insensitive id filter as all_columns; an empty result
            # would produce an empty WHERE clause, so fall back to all columns.
            duplicate_columns = [col for col in duplicate_columns if col.lower() != 'id'] or all_columns
        else:
            duplicate_columns = all_columns

        conditions = []
        for col in duplicate_columns:
            col_type = (set_typ.get(col) or '').lower()

            if col_type.startswith('decimal'):
                # Compare DECIMAL columns at their declared scale,
                # e.g. DECIMAL(10,2) -> ROUND(..., 2); default scale is 2.
                scale_match = re.search(r'decimal\(\s*\d+\s*,\s*(\d+)\s*\)', col_type)
                scale = int(scale_match.group(1)) if scale_match else 2
                conditions.append(f"ROUND(`{self._validate_identifier(col)}`, {scale}) = ROUND(%s, {scale})")
            else:
                conditions.append(f"`{self._validate_identifier(col)}` = %s")

        where_clause = " AND ".join(conditions)

        sql = f"""
        INSERT INTO `{db_name}`.`{table_name}`
        (`{column_list}`)
        SELECT {placeholders}
        FROM DUAL
        WHERE NOT EXISTS (
            SELECT 1 FROM `{db_name}`.`{table_name}`
            WHERE {where_clause}
        )
        """
    else:
        sql = f"""
        INSERT INTO `{db_name}`.`{table_name}`
        (`{column_list}`)
        VALUES ({placeholders})
        """

    total_inserted = 0
    total_skipped = 0
    total_failed = 0

    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            for i in range(0, len(data), batch_size):
                batch_start = time.time()
                batch = data[i:i + batch_size]
                successful_rows = 0  # rows in this batch executed without error

                for row in batch:
                    try:
                        row_values = [row.get(col) for col in all_columns]
                        # The NOT EXISTS sub-query needs the duplicate-key values too.
                        if check_duplicate:
                            row_values += [row.get(col) for col in duplicate_columns]

                        cursor.execute(sql, row_values)
                        # Read rowcount straight after the INSERT: 0 means the
                        # NOT EXISTS filter rejected the row as a duplicate.
                        affected = cursor.rowcount
                        conn.commit()  # commit each successful row
                    except Exception as e:
                        conn.rollback()  # undo the current row only
                        total_failed += 1

                        error_details = {
                            'batch_id': batch_id,
                            'database': db_name,
                            'table': table_name,
                            'error_type': type(e).__name__,
                            'error_message': str(e),
                            'column_types': set_typ,
                            'duplicate_check': check_duplicate,
                            'duplicate_columns': duplicate_columns
                        }
                        self._log_with_metrics('error', f"单行插入失败: {error_details}")
                        continue  # move on to the next row

                    successful_rows += 1
                    if check_duplicate and affected == 0:
                        total_skipped += 1
                    else:
                        total_inserted += 1

                batch_elapsed = time.time() - batch_start
                self._record_metrics('batch_execution_time', batch_elapsed, is_timing=True)

                batch_info = {
                    'batch_id': batch_id,
                    'batch_index': i // batch_size + 1,
                    'total_batches': (len(data) + batch_size - 1) // batch_size,
                    'batch_size': len(batch),
                    'successful_rows': successful_rows,
                    'failed_rows': len(batch) - successful_rows,
                    'time_elapsed': batch_elapsed,
                    'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
                }
                self._log_with_metrics('debug', f"批次处理完成 {batch_info}")

    # Update the global metrics.
    self.metrics['failed_rows'] += total_failed
    self._log_with_metrics('info', "数据插入完成", {
        'total_rows': len(data),
        'inserted_rows': total_inserted,
        'skipped_rows': total_skipped,
        'failed_rows': total_failed
    })
2298
+
2299
def get_metrics(self) -> Dict:
    """
    Return a snapshot of the collected metrics plus current system figures.

    The result is a shallow copy of ``self.metrics`` extended with the keys
    ``current_time``, ``system`` and ``connection_pool``; ``self.metrics``
    itself does not gain those keys.

    :return: dict of metrics
    """
    metrics = self.metrics.copy()

    # The pool's ``_connections`` attribute may be a plain counter (an int,
    # as in DBUtils' PooledDB) or a sized collection; len() on an int raises
    # TypeError, so handle both. A missing or closed pool counts as 0.
    pool = getattr(self, 'pool', None)
    active = getattr(pool, '_connections', 0)
    if not isinstance(active, int):
        try:
            active = len(active)
        except TypeError:
            active = 0

    # Add the current system figures.
    metrics.update({
        'current_time': datetime.datetime.now().isoformat(),
        'system': self._get_system_metrics(),
        'connection_pool': {
            'size': self.pool_size,
            'active': active
        }
    })

    return metrics
2314
+
2315
def close(self):
    """
    Close the connection pool and log the final metrics.

    Calling this when no pool exists (for example a second time) closes
    nothing and does not raise. An error raised by ``pool.close()`` itself is
    logged as a warning and the pool reference is still dropped; any other
    error is logged and re-raised.
    """
    close_start = time.time()

    try:
        pool = getattr(self, 'pool', None)
        if pool is not None:
            # Capture the connection figure before the pool goes away.
            active_connections = self._get_system_metrics().get('connections', 0)

            try:
                pool.close()
            except Exception as e:
                self._log_with_metrics('warning', "关闭连接池时出错", {
                    'error': str(e)
                })

            self.pool = None

            # Logged only when a pool was actually closed, so
            # active_connections is always bound here.
            elapsed = time.time() - close_start
            self._log_with_metrics('info', "连接池已关闭", {
                'active_connections_before_close': active_connections,
                'close_time_elapsed': elapsed
            })
    except Exception as e:
        elapsed = time.time() - close_start
        self._log_with_metrics('error', "关闭连接池失败", {
            'error': str(e),
            'close_time_elapsed': elapsed
        })
        raise
    finally:
        # Emit the final metrics; tolerate a partially initialised instance.
        if getattr(self, 'logger', None) and getattr(self, 'enable_metrics', False):
            self._log_with_metrics('debug', "最终性能指标", self.get_metrics())
1944
2350
 
1945
- def rename_column(self):
1946
- """ 批量修改数据库的列名 """
1947
- """
1948
- # for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2',]:
1949
- # s = OptimizeDatas(username=username, password=password, host=host, port=port)
1950
- # s.db_name = db_name
1951
- # s.rename_column()
1952
- """
1953
- tables = self.table_list(db_name=self.db_name)
1954
- for table_dict in tables:
1955
- for key, table_name in table_dict.items():
1956
- self.config.update({'database': self.db_name}) # 添加更新 config 字段
1957
- self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1958
- if not self.connection:
1959
- return
1960
- with self.connection.cursor() as cursor:
1961
- cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
1962
- columns = cursor.fetchall()
1963
- columns = [{column['Field']: column['Type']} for column in columns]
1964
- for column in columns:
1965
- for key, value in column.items():
1966
- if key.endswith('_'):
1967
- new_name = re.sub(r'_+$', '', key)
1968
- sql = f"ALTER TABLE `{table_name}` CHANGE COLUMN {key} {new_name} {value}"
1969
- cursor.execute(sql)
1970
- self.connection.commit()
1971
- if self.connection:
1972
- self.connection.close()
2351
def _check_pool_health(self):
    """
    Check that the pool can hand out a working connection.

    Takes a connection from ``self.pool``, pings it with ``reconnect=True``
    and closes it again.

    :return: True when all three steps succeed; False (after logging a
        warning that carries the error text) otherwise
    """
    try:
        conn = self.pool.connection()
        try:
            conn.ping(reconnect=True)
        finally:
            # Close the connection even when the ping fails.
            conn.close()
        return True
    except Exception as e:
        self._log_with_metrics('warning', "连接池健康检查失败", {
            'error': str(e)
        })
        return False
2363
+
2364
def retry_on_failure(max_retries=3, delay=1):
    """
    Build a decorator that retries a call on pymysql connection-level errors.

    ``pymysql.OperationalError`` and ``pymysql.InterfaceError`` trigger a
    retry after sleeping ``delay * attempt_number`` seconds. Once the attempts
    are used up, or on any other exception, ``MySQLUploaderError`` is raised
    with the original exception as its cause.

    :param max_retries: total number of attempts; values below 1 still make one attempt
    :param delay: base sleep in seconds, multiplied by the attempt number
    :return: a decorator
    """
    # Always call the wrapped function at least once, even for max_retries <= 0.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                    if attempt < attempts - 1:
                        # Wait longer before each further attempt.
                        time.sleep(delay * (attempt + 1))
                        continue
                    raise MySQLUploaderError(f"操作重试{max_retries}次后失败") from e
                except Exception as e:
                    raise MySQLUploaderError(f"操作失败: {str(e)}") from e

        return wrapper

    return decorator
1977
2385
 
1978
- # 初始化上传器
2386
def main():
    """
    Demo: upload four sample rows to a MySQL server on localhost.

    Connects as ``root`` with console logging, uploads the rows into
    database ``测试库`` / table ``测试表`` partitioned by year, and closes the
    uploader afterwards, even when the upload raises.
    """
    uploader = MySQLUploader(
        username='root',
        password='1',
        host='localhost',
        port=3306,
        logging_mode='console',
        log_level='info'
    )

    # Column names and their SQL types. No sample row below has a 'shop' key.
    set_typ = {
        'name': 'VARCHAR(255)',
        'age': 'INT',
        'salary': 'DECIMAL(10,2)',
        '日期': 'DATE',
        'shop': None,
    }

    # Sample rows. Note the 'AGE' key differs in case from 'age' in set_typ.
    data = [
        {'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
        {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': 100},
        {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 30, 'salary': 0.0},
        {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75}
    ]

    try:
        uploader.upload_data(
            db_name='测试库',
            table_name='测试表',
            data=data,
            set_typ=set_typ,  # column names and SQL types
            primary_keys=[],  # no primary-key columns given
            check_duplicate=True,  # enable the duplicate check
            duplicate_columns=[],  # no explicit de-duplication key
            allow_null=False,  # null values are not allowed
            partition_by='year',  # one table per year
            partition_date_column='日期',  # date column used to pick the partition table
            auto_create=True,  # create the database when it does not exist
            indexes=[],  # no index columns given
        )
    finally:
        # Release the connection pool even if the upload raised.
        uploader.close()
2430
+
2431
+
2432
# main() is not called here, so running this module as a script does nothing.
if __name__ == '__main__':
    pass