mdbq 3.9.0__py3-none-any.whl → 3.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/mysql/mysql.py CHANGED
@@ -10,10 +10,13 @@ import pandas as pd
  from sqlalchemy import create_engine
  import os
  import logging
+ import logging.handlers
  from mdbq.other import otk
-
- from dbutils.pooled_db import PooledDB
  from typing import Union, List, Dict, Optional, Any, Tuple
+ from dbutils.pooled_db import PooledDB
+ import json
+ import psutil # 用于监控资源使用情况
+

  warnings.filterwarnings('ignore')
  """
@@ -46,565 +49,427 @@ def count_decimal_places(num_str):
46
49
  return 0, 0
47
50
 
48
51
 
49
- class MySQLUploader:
50
- def __init__(
51
- self,
52
- username: str,
53
- password: str,
54
- host: str = 'localhost',
55
- port: int = 3306,
56
- charset: str = 'utf8mb4',
57
- collation: str = 'utf8mb4_0900_ai_ci',
58
- enable_logging: bool = False,
59
- log_level: str = 'ERROR',
60
- max_retries: int = 10,
61
- retry_interval: int = 10,
62
- pool_size: int = 5
63
- ):
64
- """
65
- 初始化MySQL上传工具
66
-
67
- :param username: 数据库用户名
68
- :param password: 数据库密码
69
- :param host: 数据库主机地址,默认为localhost
70
- :param port: 数据库端口,默认为3306
71
- :param charset: 字符集,默认为utf8mb4
72
- :param collation: 排序规则,默认为utf8mb4_0900_ai_ci
73
- :param enable_logging: 是否启用日志,默认为False
74
- :param log_level: 日志级别,默认为ERROR
75
- :param max_retries: 最大重试次数,默认为10
76
- :param retry_interval: 重试间隔(秒),默认为10
77
- :param pool_size: 连接池大小,默认为5
78
- """
52
+ class MysqlUpload:
53
+ def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
79
54
  self.username = username
80
55
  self.password = password
81
56
  self.host = host
82
57
  self.port = port
83
- self.charset = charset
84
- self.collation = collation
85
- self.max_retries = max_retries
86
- self.retry_interval = retry_interval
87
- self.pool_size = pool_size
88
-
89
- # 初始化日志
90
- if enable_logging:
91
- self._init_logging(log_level)
92
-
93
- # 创建连接池
94
- self.pool = self._create_connection_pool()
95
-
96
- def _init_logging(self, log_level: str):
97
- """初始化日志配置"""
98
- logging.basicConfig(
99
- level=getattr(logging, log_level.upper(), logging.ERROR),
100
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
101
- )
102
- self.logger = logging.getLogger('MySQLUploader')
103
-
104
- def _create_connection_pool(self):
105
- """创建数据库连接池"""
106
- return PooledDB(
107
- creator=pymysql,
108
- host=self.host,
109
- port=self.port,
110
- user=self.username,
111
- password=self.password,
112
- charset=self.charset,
113
- maxconnections=self.pool_size,
114
- cursorclass=pymysql.cursors.DictCursor
115
- )
116
-
117
- def _validate_identifier(self, identifier: str) -> str:
118
- """
119
- 验证并清理数据库标识符(数据库名、表名、列名)
120
- 防止SQL注入和非法字符
58
+ if username == '' or password == '' or host == '' or port == 0:
59
+ self.config = None
60
+ else:
61
+ self.config = {
62
+ 'host': self.host,
63
+ 'port': int(self.port),
64
+ 'user': self.username,
65
+ 'password': self.password,
66
+ 'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
67
+ 'cursorclass': pymysql.cursors.DictCursor,
68
+ }
69
+ self.filename = None
121
70
 
122
- :param identifier: 要验证的标识符
123
- :return: 清理后的安全标识符
124
- """
125
- if not identifier or not isinstance(identifier, str):
126
- raise ValueError(f"Invalid identifier: {identifier}")
71
+ @staticmethod
72
+ def try_except(func): # 在类内部定义一个异常处理方法
127
73
 
128
- # 移除可能有害的字符
129
- cleaned = re.sub(r'[^a-zA-Z0-9_$]', '', identifier)
130
- if not cleaned:
131
- raise ValueError(f"Invalid identifier after cleaning: {identifier}")
74
+ @wraps(func)
75
+ def wrapper(*args, **kwargs):
76
+ try:
77
+ return func(*args, **kwargs)
78
+ except Exception as e:
79
+ logger.error(f'{func.__name__}, {e}') # 将异常信息返回
132
80
 
133
- return cleaned
81
+ return wrapper
134
82
 
135
- def _validate_value(self, value: Any, column_type: str) -> Any:
136
- """
137
- 验证并清理数据值,根据列类型进行适当转换
83
+ def keep_connect(self, _db_name, _config, max_try: int=10):
84
+ attempts = 1
85
+ while attempts <= max_try:
86
+ try:
87
+ connection = pymysql.connect(**_config) # 连接数据库
88
+ return connection
89
+ except Exception as e:
90
+ logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
91
+ attempts += 1
92
+ time.sleep(30)
93
+ logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
94
+ return None
138
95
 
139
- :param value: 要验证的值
140
- :param column_type: 列的数据类型
141
- :return: 清理后的值
142
- """
143
- if value is None:
144
- return None
96
+ def cover_doc_dtypes(self, dict_data):
97
+ """ 清理字典键值 并转换数据类型 """
98
+ if not dict_data:
99
+ logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
100
+ return
101
+ __res_dict = {}
102
+ new_dict_data = {}
103
+ for k, v in dict_data.items():
104
+ k = str(k).lower()
105
+ k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
106
+ k = k.replace(')', '')
107
+ k = re.sub(r'_{2,}', '_', k)
108
+ k = re.sub(r'_+$', '', k)
109
+ result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
110
+ result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
111
+ result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
112
+ result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
145
113
 
146
- try:
147
- if 'int' in column_type.lower():
148
- return int(value) if value is not None else None
149
- elif 'float' in column_type.lower() or 'double' in column_type.lower() or 'decimal' in column_type.lower():
150
- return float(value) if value is not None else None
151
- elif 'date' in column_type.lower() or 'time' in column_type.lower():
152
- if isinstance(value, (datetime.datetime, pd.Timestamp)):
153
- return value.strftime('%Y-%m-%d %H:%M:%S')
154
- return str(value)
155
- elif 'char' in column_type.lower() or 'text' in column_type.lower():
156
- return str(value)
114
+ date_type = otk.is_valid_date(v) # 判断日期时间
115
+ int_num = otk.is_integer(v) # 判断整数
116
+ count_int, count_float = count_decimal_places(v) # 判断小数,返回小数位数
117
+ if result1: # 京东sku/spu商品信息
118
+ __res_dict.update({k: 'varchar(100)'})
119
+ elif k == '日期':
120
+ __res_dict.update({k: 'DATE'})
121
+ elif k == '更新时间':
122
+ __res_dict.update({k: 'TIMESTAMP'})
123
+ elif result2: # 小数
124
+ __res_dict.update({k: 'decimal(10,4)'})
125
+ elif date_type == 1: # 纯日期
126
+ __res_dict.update({k: 'DATE'})
127
+ elif date_type == 2: # 日期+时间
128
+ __res_dict.update({k: 'DATETIME'})
129
+ elif int_num:
130
+ __res_dict.update({k: 'INT'})
131
+ elif count_float > 0:
132
+ if count_int + count_float > 10:
133
+ if count_float >= 6:
134
+ __res_dict.update({k: 'decimal(14,6)'})
135
+ else:
136
+ __res_dict.update({k: 'decimal(14,4)'})
137
+ elif count_float >= 6:
138
+ __res_dict.update({k: 'decimal(14,6)'})
139
+ elif count_float >= 4:
140
+ __res_dict.update({k: 'decimal(12,4)'})
141
+ else:
142
+ __res_dict.update({k: 'decimal(10,2)'})
157
143
  else:
158
- return value
159
- except (ValueError, TypeError) as e:
160
- raise ValueError(f"Failed to convert value {value} to type {column_type}: {str(e)}")
144
+ __res_dict.update({k: 'varchar(255)'})
145
+ new_dict_data.update({k: v})
146
+ __res_dict.update({'数据主体': 'longblob'})
147
+ return __res_dict, new_dict_data
161
148
 
162
- def _execute_with_retry(self, func, *args, **kwargs):
149
+ @try_except
150
+ def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
163
151
  """
164
- 带重试机制的SQL执行装饰器
165
-
166
- :param func: 要执行的函数
167
- :param args: 位置参数
168
- :param kwargs: 关键字参数
169
- :return: 函数执行结果
152
+ 插入字典数据
153
+ dict_data: 字典
154
+ index_length: 索引长度
155
+ icm_update: 增量更正
156
+ set_typ: {}
157
+ allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
170
158
  """
159
+ if not self.config:
160
+ return
171
161
 
172
- @wraps(func)
173
- def wrapper(*args, **kwargs):
174
- last_exception = None
175
- for attempt in range(self.max_retries):
162
+ if not dict_data_list:
163
+ logger.info(f'dict_data_list 不能为空 ')
164
+ return
165
+ dict_data = dict_data_list[0]
166
+ if cut_data:
167
+ if '日期' in dict_data.keys():
176
168
  try:
177
- return func(*args, **kwargs)
178
- except pymysql.OperationalError as e:
179
- last_exception = e
180
- if attempt < self.max_retries - 1:
181
- time.sleep(self.retry_interval)
182
- # 尝试重新连接
183
- self.pool = self._create_connection_pool()
184
- continue
185
- raise last_exception if last_exception else Exception("Unknown error occurred")
169
+ __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
170
+ __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
171
+ if str(cut_data).lower() == 'year':
172
+ table_name = f'{table_name}_{__y}'
173
+ elif str(cut_data).lower() == 'month':
174
+ table_name = f'{table_name}_{__y_m}'
175
+ else:
176
+ logger.info(f'参数不正确,cut_data应为 year 或 month ')
177
+ except Exception as e:
178
+ logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
186
179
 
187
- return wrapper(*args, **kwargs)
180
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
181
+ if not connection:
182
+ return
183
+ with connection.cursor() as cursor:
184
+ cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
185
+ database_exists = cursor.fetchone()
186
+ if not database_exists:
187
+ # 如果数据库不存在,则新建
188
+ sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
189
+ cursor.execute(sql)
190
+ connection.commit()
191
+ logger.info(f"创建Database: {db_name}")
188
192
 
189
- def _get_connection(self):
190
- """从连接池获取连接"""
191
- return self.pool.connection()
193
+ self.config.update({'database': db_name}) # 添加更新 config 字段
194
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
195
+ if not connection:
196
+ return
197
+ with connection.cursor() as cursor:
198
+ # 1. 查询表, 不存在则创建一个空表
199
+ sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
200
+ cursor.execute(sql, (table_name,))
201
+ if not cursor.fetchone():
202
+ sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
203
+ cursor.execute(sql)
204
+ logger.info(f'创建 mysql 表: {table_name}')
192
205
 
193
- def _check_database_exists(self, db_name: str) -> bool:
194
- """检查数据库是否存在"""
195
- db_name = self._validate_identifier(db_name)
196
- sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
206
+ # 根据 dict_data 的值添加指定的数据类型
207
+ dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
208
+ if set_typ:
209
+ # 更新自定义的列数据类型
210
+ for k, v in dtypes.copy().items():
211
+ # 确保传进来的 set_typ 键存在于实际的 df 列才 update
212
+ [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
197
213
 
198
- with self._get_connection() as conn:
199
- with conn.cursor() as cursor:
200
- cursor.execute(sql, (db_name,))
201
- return bool(cursor.fetchone())
214
+ # 检查列
215
+ sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
216
+ cursor.execute(sql, (db_name, table_name))
217
+ col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
218
+ col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
219
+ # 不存在则新建列
220
+ if col_not_exist: # 数据表中不存在的列
221
+ for col in col_not_exist:
222
+ # 创建列,需转义
223
+ if allow_not_null:
224
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
225
+ else:
226
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
202
227
 
203
- def _create_database(self, db_name: str):
204
- """创建数据库"""
205
- db_name = self._validate_identifier(db_name)
206
- sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
228
+ cursor.execute(sql)
229
+ logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
207
230
 
208
- with self._get_connection() as conn:
209
- with conn.cursor() as cursor:
210
- cursor.execute(sql)
211
- conn.commit()
231
+ if col == '日期':
232
+ sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
233
+ logger.info(f"设置为索引: {col}({dtypes[col]})")
234
+ cursor.execute(sql)
212
235
 
213
- def _check_table_exists(self, db_name: str, table_name: str) -> bool:
214
- """检查表是否存在"""
215
- db_name = self._validate_identifier(db_name)
216
- table_name = self._validate_identifier(table_name)
217
- sql = """
218
- SELECT TABLE_NAME
219
- FROM INFORMATION_SCHEMA.TABLES
220
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
221
- """
236
+ connection.commit() # 提交事务
237
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
238
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
239
+ # 处理插入的数据
240
+ for dict_data in dict_data_list:
241
+ dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
242
+ if icm_update:
243
+ """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
244
+ sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
245
+ cursor.execute(sql, (db_name, table_name))
246
+ columns = cursor.fetchall()
247
+ cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
248
+ # 保留原始列名,不提前转义
249
+ raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
222
250
 
223
- with self._get_connection() as conn:
224
- with conn.cursor() as cursor:
225
- cursor.execute(sql, (db_name, table_name))
226
- return bool(cursor.fetchone())
251
+ # 构建条件参数(使用原始列名)
252
+ condition_params = []
253
+ condition_parts = []
254
+ for up_col in icm_update:
255
+ condition_parts.append(f"`{up_col}` = %s") # SQL 转义
256
+ condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
227
257
 
228
- def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
229
- """获取表的列名和数据类型"""
230
- db_name = self._validate_identifier(db_name)
231
- table_name = self._validate_identifier(table_name)
232
- sql = """
233
- SELECT COLUMN_NAME, DATA_TYPE
234
- FROM INFORMATION_SCHEMA.COLUMNS
235
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
236
- """
258
+ # 动态转义列名生成 SQL 查询字段
259
+ escaped_update_col = [f'`{col}`' for col in raw_update_col]
260
+ sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
261
+ cursor.execute(sql, condition_params)
262
+ results = cursor.fetchall()
237
263
 
238
- with self._get_connection() as conn:
239
- with conn.cursor() as cursor:
240
- cursor.execute(sql, (db_name, table_name))
241
- return {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
264
+ if results:
265
+ for result in results:
266
+ change_col = []
267
+ change_placeholders = []
268
+ set_params = []
269
+ for raw_col in raw_update_col:
270
+ # 使用原始列名访问数据
271
+ df_value = str(dict_data[raw_col])
272
+ mysql_value = str(result[raw_col])
242
273
 
243
- def _create_table(
244
- self,
245
- db_name: str,
246
- table_name: str,
247
- columns: Dict[str, str],
248
- primary_keys: Optional[List[str]] = None,
249
- date_column: Optional[str] = None
250
- ):
251
- """
252
- 创建数据表
274
+ # 清理小数点后多余的零
275
+ if '.' in df_value:
276
+ df_value = re.sub(r'0+$', '', df_value).rstrip('.')
277
+ if '.' in mysql_value:
278
+ mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
253
279
 
254
- :param db_name: 数据库名
255
- :param table_name: 表名
256
- :param columns: 列名和数据类型字典 {列名: 数据类型}
257
- :param primary_keys: 主键列列表
258
- :param date_column: 日期列名,如果存在将设置为索引
259
- """
260
- db_name = self._validate_identifier(db_name)
261
- table_name = self._validate_identifier(table_name)
280
+ if df_value != mysql_value:
281
+ change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
282
+ set_params.append(dict_data[raw_col])
283
+ change_col.append(raw_col)
262
284
 
263
- if not columns:
264
- raise ValueError("No columns specified for table creation")
285
+ if change_placeholders:
286
+ full_params = set_params + condition_params
287
+ sql = f"""UPDATE `{table_name}`
288
+ SET {','.join(change_placeholders)}
289
+ WHERE {' AND '.join(condition_parts)}"""
290
+ cursor.execute(sql, full_params)
291
+ else: # 没有数据返回,则直接插入数据
292
+ # 参数化插入
293
+ cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
294
+ placeholders = ', '.join(['%s'] * len(dict_data))
295
+ sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})"
296
+ cursor.execute(sql, tuple(dict_data.values()))
297
+ connection.commit() # 提交数据库
298
+ continue
265
299
 
266
- # 构建列定义SQL
267
- column_defs = []
268
- for col_name, col_type in columns.items():
269
- safe_col_name = self._validate_identifier(col_name)
270
- col_def = f"`{safe_col_name}` {col_type}"
271
- column_defs.append(col_def)
300
+ # 标准插入逻辑(参数化修改)
301
+ # 构造更新列(排除主键)
302
+ update_cols = [k for k in dict_data.keys()]
303
+ # 构建SQL
304
+ cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
305
+ placeholders = ', '.join(['%s'] * len(dict_data))
306
+ update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
272
307
 
273
- # 添加主键定义
274
- primary_key_sql = ""
275
- if primary_keys:
276
- safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
277
- primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
308
+ sql = f"""INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
309
+ # 执行参数化查询
310
+ try:
311
+ cursor.execute(sql, tuple(dict_data.values()))
312
+ connection.commit()
313
+ except pymysql.Error as e:
314
+ logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
315
+ connection.rollback()
316
+ connection.close()
278
317
 
279
- # 构建完整SQL
280
- sql = f"""
281
- CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
282
- {','.join(column_defs)}
283
- {primary_key_sql}
284
- ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
318
+ # @try_except
319
+ def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
320
+ """
321
+ 插入字典数据
322
+ dict_data: 字典
323
+ index_length: 索引长度
324
+ icm_update: 增量更新
325
+ set_typ: {}
326
+ allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
285
327
  """
328
+ if not self.config:
329
+ return
286
330
 
287
- with self._get_connection() as conn:
288
- with conn.cursor() as cursor:
289
- cursor.execute(sql)
331
+ if cut_data:
332
+ if '日期' in dict_data.keys():
333
+ try:
334
+ __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
335
+ __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
336
+ if str(cut_data).lower() == 'year':
337
+ table_name = f'{table_name}_{__y}'
338
+ elif str(cut_data).lower() == 'month':
339
+ table_name = f'{table_name}_{__y_m}'
340
+ else:
341
+ logger.info(f'参数不正确,cut_data应为 year 或 month ')
342
+ except Exception as e:
343
+ logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
290
344
 
291
- # 如果存在日期列,添加索引
292
- if date_column and date_column in columns:
293
- safe_date_col = self._validate_identifier(date_column)
294
- index_sql = f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
295
- with conn.cursor() as cursor:
296
- cursor.execute(index_sql)
345
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
346
+ if not connection:
347
+ return
348
+ with connection.cursor() as cursor:
349
+ cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
350
+ database_exists = cursor.fetchone()
351
+ if not database_exists:
352
+ # 如果数据库不存在,则新建
353
+ sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
354
+ cursor.execute(sql)
355
+ connection.commit()
356
+ logger.info(f"创建Database: {db_name}")
297
357
 
298
- conn.commit()
358
+ self.config.update({'database': db_name}) # 添加更新 config 字段
359
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
360
+ if not connection:
361
+ return
362
+ with connection.cursor() as cursor:
363
+ # 1. 查询表, 不存在则创建一个空表
364
+ sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
365
+ cursor.execute(sql, (table_name,))
366
+ if not cursor.fetchone():
367
+ sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
368
+ cursor.execute(sql)
369
+ logger.info(f'创建 mysql 表: {table_name}')
299
370
 
300
- def _prepare_data(
301
- self,
302
- data: Union[Dict, List[Dict], pd.DataFrame],
303
- columns: Dict[str, str],
304
- allow_null: bool = False
305
- ) -> List[Dict]:
306
- """
307
- 准备要上传的数据,验证并转换数据类型
371
+ # 根据 dict_data 的值添加指定的数据类型
372
+ dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
373
+ if set_typ:
374
+ # 更新自定义的列数据类型
375
+ for k, v in dtypes.copy().items():
376
+ # 确保传进来的 set_typ 键存在于实际的 df 列才 update
377
+ [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
308
378
 
309
- :param data: 输入数据
310
- :param columns: 列名和数据类型字典 {列名: 数据类型}
311
- :param allow_null: 是否允许空值
312
- :return: 准备好的数据列表
313
- """
314
- # 统一数据格式为字典列表
315
- if isinstance(data, pd.DataFrame):
316
- data = data.to_dict('records')
317
- elif isinstance(data, dict):
318
- data = [data]
379
+ # 检查列
380
+ sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
381
+ cursor.execute(sql, (db_name, table_name))
382
+ col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
383
+ col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
384
+ # 不存在则新建列
385
+ if col_not_exist: # 数据表中不存在的列
386
+ for col in col_not_exist:
387
+ # 创建列,需转义
388
+ if allow_not_null:
389
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
390
+ else:
391
+ sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
392
+ cursor.execute(sql)
393
+ logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
319
394
 
320
- if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
321
- raise ValueError("Data must be a dict, list of dicts, or DataFrame")
395
+ if col == '日期':
396
+ sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
397
+ logger.info(f"设置为索引: {col}({dtypes[col]})")
398
+ cursor.execute(sql)
399
+ connection.commit() # 提交事务
400
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
401
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
402
+ # 处理插入的数据
403
+ if icm_update:
404
+ """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
405
+ sql = """SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s"""
406
+ cursor.execute(sql, (db_name, table_name))
407
+ cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()] # 数据表的所有列, 返回 list
322
408
 
323
- prepared_data = []
324
- for row in data:
325
- prepared_row = {}
326
- for col_name, col_type in columns.items():
327
- if col_name not in row:
328
- if not allow_null:
329
- raise ValueError(f"Missing required column '{col_name}' in data")
330
- prepared_row[col_name] = None
331
- else:
332
- try:
333
- prepared_row[col_name] = self._validate_value(row[col_name], col_type)
334
- except ValueError as e:
335
- raise ValueError(f"Error in column '{col_name}': {str(e)}")
336
- prepared_data.append(prepared_row)
409
+ # 保留原始列名,不提前转义
410
+ raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id']
337
411
 
338
- return prepared_data
412
+ # 构建条件参数(使用原始列名)
413
+ condition_params = []
414
+ condition_parts = []
415
+ for up_col in icm_update:
416
+ condition_parts.append(f"`{up_col}` = %s") # SQL 转义
417
+ condition_params.append(dict_data[up_col]) # 原始列名访问数据
339
418
 
340
- def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
341
- """
342
- 获取分表名称
419
+ # 动态转义列名生成 SQL 查询字段
420
+ escaped_update_col = [f'`{col}`' for col in raw_update_col]
421
+ sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
422
+ cursor.execute(sql, condition_params)
423
+ results = cursor.fetchall()
343
424
 
344
- :param table_name: 基础表名
345
- :param date_value: 日期值
346
- :param partition_by: 分表方式 ('year' 或 'month')
347
- :return: 分表名称
348
- """
349
- try:
350
- date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
351
- except ValueError:
352
- try:
353
- date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
354
- except ValueError:
355
- raise ValueError(f"Invalid date format: {date_value}")
356
-
357
- if partition_by == 'year':
358
- return f"{table_name}_{date_obj.year}"
359
- elif partition_by == 'month':
360
- return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
361
- else:
362
- raise ValueError("partition_by must be 'year' or 'month'")
363
-
364
- def _insert_data(
365
- self,
366
- db_name: str,
367
- table_name: str,
368
- data: List[Dict],
369
- columns: Dict[str, str],
370
- check_duplicate: bool = False,
371
- duplicate_columns: Optional[List[str]] = None,
372
- batch_size: int = 1000
373
- ):
374
- """
375
- 插入数据到表中
376
-
377
- :param db_name: 数据库名
378
- :param table_name: 表名
379
- :param data: 要插入的数据
380
- :param columns: 列名和数据类型字典
381
- :param check_duplicate: 是否检查重复
382
- :param duplicate_columns: 用于检查重复的列列表
383
- :param batch_size: 批量插入的大小
384
- """
385
- db_name = self._validate_identifier(db_name)
386
- table_name = self._validate_identifier(table_name)
387
-
388
- if not data:
389
- return
390
-
391
- # 获取所有列名
392
- all_columns = list(columns.keys())
393
- safe_columns = [self._validate_identifier(col) for col in all_columns]
394
- placeholders = ','.join(['%s'] * len(safe_columns))
395
-
396
- # 构建INSERT SQL
397
- if check_duplicate:
398
- if duplicate_columns:
399
- # 只检查指定列的重复
400
- dup_columns = [self._validate_identifier(col) for col in duplicate_columns]
401
- else:
402
- # 检查所有列的重复
403
- dup_columns = safe_columns
404
-
405
- # 构建ON DUPLICATE KEY UPDATE子句
406
- update_clause = ','.join([f"`{col}`=VALUES(`{col}`)" for col in safe_columns])
407
-
408
- sql = f"""
409
- INSERT INTO `{db_name}`.`{table_name}`
410
- (`{'`,`'.join(safe_columns)}`)
411
- VALUES ({placeholders})
412
- ON DUPLICATE KEY UPDATE {update_clause}
413
- """
414
- else:
415
- sql = f"""
416
- INSERT INTO `{db_name}`.`{table_name}`
417
- (`{'`,`'.join(safe_columns)}`)
418
- VALUES ({placeholders})
419
- """
420
-
421
- # 分批插入数据
422
- with self._get_connection() as conn:
423
- with conn.cursor() as cursor:
424
- for i in range(0, len(data), batch_size):
425
- batch = data[i:i + batch_size]
426
- # 准备批量数据
427
- values = []
428
- for row in batch:
429
- row_values = []
430
- for col in all_columns:
431
- row_values.append(row.get(col))
432
- values.append(row_values)
433
-
434
- # 执行批量插入
435
- try:
436
- cursor.executemany(sql, values)
437
- conn.commit()
438
- except Exception as e:
439
- conn.rollback()
440
- raise e
441
-
442
- def upload_data(
443
- self,
444
- db_name: str,
445
- table_name: str,
446
- data: Union[Dict, List[Dict], pd.DataFrame],
447
- columns: Dict[str, str],
448
- primary_keys: Optional[List[str]] = None,
449
- check_duplicate: bool = False,
450
- duplicate_columns: Optional[List[str]] = None,
451
- allow_null: bool = False,
452
- partition_by: Optional[str] = None,
453
- partition_date_column: str = '日期',
454
- auto_create: bool = True
455
- ):
456
- """
457
- 上传数据到数据库
458
-
459
- :param db_name: 数据库名
460
- :param table_name: 表名
461
- :param data: 要上传的数据
462
- :param columns: 列名和数据类型字典 {列名: 数据类型}
463
- :param primary_keys: 主键列列表
464
- :param check_duplicate: 是否检查重复,默认为False
465
- :param duplicate_columns: 用于检查重复的列列表,如果不指定则使用所有列
466
- :param allow_null: 是否允许空值,默认为False
467
- :param partition_by: 分表方式 ('year' 或 'month'),默认为None不分表
468
- :param partition_date_column: 用于分表的日期列名,默认为'日期'
469
- :param auto_create: 是否自动创建不存在的数据库或表,默认为True
470
- """
471
- # 验证参数
472
- if not columns:
473
- raise ValueError("Columns specification is required")
474
-
475
- if partition_by and partition_by not in ['year', 'month']:
476
- raise ValueError("partition_by must be 'year', 'month' or None")
477
-
478
- # 准备数据
479
- prepared_data = self._prepare_data(data, columns, allow_null)
480
-
481
- # 检查数据库是否存在
482
- if not self._check_database_exists(db_name):
483
- if auto_create:
484
- self._create_database(db_name)
485
- else:
486
- raise ValueError(f"Database '{db_name}' does not exist")
487
-
488
- # 处理分表逻辑
489
- if partition_by:
490
- # 分组数据按分表
491
- partitioned_data = {}
492
- for row in prepared_data:
493
- if partition_date_column not in row:
494
- raise ValueError(f"Partition date column '{partition_date_column}' not found in data")
495
- part_table = self._get_partition_table_name(table_name, str(row[partition_date_column]), partition_by)
496
- if part_table not in partitioned_data:
497
- partitioned_data[part_table] = []
498
- partitioned_data[part_table].append(row)
499
-
500
- # 对每个分表执行上传
501
- for part_table, part_data in partitioned_data.items():
502
- self._upload_to_table(
503
- db_name, part_table, part_data, columns,
504
- primary_keys, check_duplicate, duplicate_columns,
505
- allow_null, auto_create, partition_date_column
506
- )
507
- else:
508
- # 不分表,直接上传
509
- self._upload_to_table(
510
- db_name, table_name, prepared_data, columns,
511
- primary_keys, check_duplicate, duplicate_columns,
512
- allow_null, auto_create, partition_date_column
513
- )
514
-
515
- def _upload_to_table(
516
- self,
517
- db_name: str,
518
- table_name: str,
519
- data: List[Dict],
520
- columns: Dict[str, str],
521
- primary_keys: Optional[List[str]],
522
- check_duplicate: bool,
523
- duplicate_columns: Optional[List[str]],
524
- allow_null: bool,
525
- auto_create: bool,
526
- date_column: Optional[str]
527
- ):
528
- """实际执行表上传的内部方法"""
529
- # 检查表是否存在
530
- if not self._check_table_exists(db_name, table_name):
531
- if auto_create:
532
- self._create_table(db_name, table_name, columns, primary_keys, date_column)
533
- else:
534
- raise ValueError(f"Table '{db_name}.{table_name}' does not exist")
535
-
536
- # 获取表结构并验证
537
- table_columns = self._get_table_columns(db_name, table_name)
538
- if not table_columns:
539
- raise ValueError(f"Failed to get columns for table '{db_name}.{table_name}'")
540
-
541
- # 验证数据列与表列匹配
542
- for col in columns:
543
- if col not in table_columns:
544
- raise ValueError(f"Column '{col}' not found in table '{db_name}.{table_name}'")
545
-
546
- # 插入数据
547
- self._insert_data(
548
- db_name, table_name, data, columns,
549
- check_duplicate, duplicate_columns
550
- )
551
-
552
- def close(self):
553
- """关闭连接池"""
554
- self.pool.close()
555
-
556
- def __enter__(self):
557
- return self
558
-
559
- def __exit__(self, exc_type, exc_val, exc_tb):
560
- self.close()
561
-
562
-
563
- class MysqlUpload:
564
- def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
565
- self.username = username
566
- self.password = password
567
- self.host = host
568
- self.port = port
569
- if username == '' or password == '' or host == '' or port == 0:
570
- self.config = None
571
- else:
572
- self.config = {
573
- 'host': self.host,
574
- 'port': int(self.port),
575
- 'user': self.username,
576
- 'password': self.password,
577
- 'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
578
- 'cursorclass': pymysql.cursors.DictCursor,
579
- }
580
- self.filename = None
425
+ if results:
426
+ for result in results:
427
+ change_col = []
428
+ change_placeholders = []
429
+ set_params = []
430
+ for raw_col in raw_update_col:
431
+ # 使用原始列名访问数据
432
+ df_value = str(dict_data[raw_col])
433
+ mysql_value = str(result[raw_col])
581
434
 
582
- @staticmethod
583
- def try_except(func): # 在类内部定义一个异常处理方法
435
+ # 清理小数点后多余的零
436
+ if '.' in df_value:
437
+ df_value = re.sub(r'0+$', '', df_value).rstrip('.')
438
+ if '.' in mysql_value:
439
+ mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
584
440
 
585
- @wraps(func)
586
- def wrapper(*args, **kwargs):
587
- try:
588
- return func(*args, **kwargs)
589
- except Exception as e:
590
- logger.error(f'{func.__name__}, {e}') # 将异常信息返回
441
+ if df_value != mysql_value:
442
+ change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
443
+ set_params.append(dict_data[raw_col])
444
+ change_col.append(raw_col)
591
445
 
592
- return wrapper
446
+ if change_placeholders:
447
+ full_params = set_params + condition_params
448
+ sql = f"""UPDATE `{table_name}`
449
+ SET {','.join(change_placeholders)}
450
+ WHERE {' AND '.join(condition_parts)}"""
451
+ cursor.execute(sql, full_params)
452
+ else: # 没有数据返回,则直接插入数据
453
+ # 参数化插入语句
454
+ keys = [f"`{k}`" for k in dict_data.keys()]
455
+ placeholders = ','.join(['%s'] * len(dict_data))
456
+ update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
457
+ sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
458
+ cursor.execute(sql, tuple(dict_data.values()))
459
+ connection.commit() # 提交数据库
460
+ connection.close()
461
+ return
593
462
 
594
- def keep_connect(self, _db_name, _config, max_try: int=10):
595
- attempts = 1
596
- while attempts <= max_try:
597
- try:
598
- connection = pymysql.connect(**_config) # 连接数据库
599
- return connection
600
- except Exception as e:
601
- logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
602
- attempts += 1
603
- time.sleep(30)
604
- logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
605
- return None
463
+ # 常规插入处理(参数化)
464
+ keys = [f"`{k}`" for k in dict_data.keys()]
465
+ placeholders = ','.join(['%s'] * len(dict_data))
466
+ update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
467
+ sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
468
+ cursor.execute(sql, tuple(dict_data.values()))
469
+ connection.commit()
470
+ connection.close()
606
471
 
607
- def cover_doc_dtypes(self, dict_data):
472
+ def cover_dict_dtypes(self, dict_data):
608
473
  """ 清理字典键值 并转换数据类型 """
609
474
  if not dict_data:
610
475
  logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
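For reference, a minimal usage sketch of the MysqlUpload class introduced above, based only on the signatures visible in this hunk; the connection details, database name, and row contents are placeholder assumptions, not values from the package.

    from mdbq.mysql.mysql import MysqlUpload

    # Placeholder credentials -- replace with a real MySQL server.
    uploader = MysqlUpload(username='user', password='pass', host='127.0.0.1', port=3306)

    # dict_to_mysql inserts a single dict; insert_many_dict takes a list of dicts.
    row = {'日期': '2024-08-27', '店铺名称': 'demo', '花费': '12.5'}
    uploader.dict_to_mysql(db_name='test_db', table_name='demo_table', dict_data=row)
    uploader.insert_many_dict(db_name='test_db', table_name='demo_table', dict_data_list=[row])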
@@ -617,6 +482,14 @@ class MysqlUpload:
  k = k.replace(')', '')
  k = re.sub(r'_{2,}', '_', k)
  k = re.sub(r'_+$', '', k)
+ if str(v) == '':
+ v = 0
+ v = str(v)
+ v = re.sub('^="|"$', '', v, re.I)
+ v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(v)) # 移除控制字符
+ if re.findall(r'^[-+]?\d+\.?\d*%$', v):
+ v = str(float(v.rstrip("%")) / 100)
+
  result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
  result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
  result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
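To illustrate the value normalization these added lines perform (strip Excel-style ="..." wrappers, drop control characters, convert percent strings to decimals), here is a standalone sketch applying the same regexes; it is illustrative only and not part of the package.

    import re

    def normalize(v):
        if str(v) == '':
            v = 0
        v = str(v)
        v = re.sub('^="|"$', '', v)                 # strip leading ="  and trailing "
        v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', v)  # remove control characters
        if re.findall(r'^[-+]?\d+\.?\d*%$', v):     # percent string -> decimal string
            v = str(float(v.rstrip('%')) / 100)
        return v

    print(normalize('="12345"'))  # 12345
    print(normalize('12.5%'))     # 0.125
    print(normalize(''))          # 0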
@@ -641,6 +514,8 @@ class MysqlUpload:
  __res_dict.update({k: 'INT'})
  elif count_float > 0:
  if count_int + count_float > 10:
+ # if count_float > 5:
+ # v = round(float(v), 4)
  if count_float >= 6:
  __res_dict.update({k: 'decimal(14,6)'})
  else:
@@ -654,45 +529,110 @@ class MysqlUpload:
654
529
  else:
655
530
  __res_dict.update({k: 'varchar(255)'})
656
531
  new_dict_data.update({k: v})
657
- __res_dict.update({'数据主体': 'longblob'})
658
532
  return __res_dict, new_dict_data
659
533
 
534
+ def convert_df_dtypes(self, df: pd.DataFrame):
535
+ """ 清理 df 的值和列名,并转换数据类型 """
536
+ df = otk.cover_df(df=df) # 清理 df 的值和列名
537
+ [pd.to_numeric(df[col], errors='ignore') for col in df.columns.tolist()]
538
+ dtypes = df.dtypes.to_dict()
539
+ __res_dict = {}
540
+ for k, v in dtypes.copy().items():
541
+ result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
542
+ result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
543
+ result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
544
+ result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
545
+
546
+ if result1: # id/sku/spu商品信息
547
+ __res_dict.update({k: 'varchar(50)'})
548
+ elif result2: # 小数
549
+ __res_dict.update({k: 'decimal(10,4)'})
550
+ elif result3: # 小数
551
+ __res_dict.update({k: 'decimal(12,4)'})
552
+ elif result4: # 小数
553
+ __res_dict.update({k: 'decimal(12,2)'})
554
+ elif k == '日期':
555
+ __res_dict.update({k: 'date'})
556
+ elif k == '更新时间':
557
+ __res_dict.update({k: 'timestamp'})
558
+ elif v == 'int64':
559
+ __res_dict.update({k: 'int'})
560
+ elif v == 'float64':
561
+ __res_dict.update({k: 'decimal(10,4)'})
562
+ elif v == 'bool':
563
+ __res_dict.update({k: 'boolean'})
564
+ elif v == 'datetime64[ns]':
565
+ __res_dict.update({k: 'datetime'})
566
+ else:
567
+ __res_dict.update({k: 'varchar(255)'})
568
+ return __res_dict, df
569
+
660
570
  @try_except
661
- def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
571
+ def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False,
572
+ filename=None, count=None, allow_not_null=False, cut_data=None):
662
573
  """
663
- 插入字典数据
664
- dict_data: 字典
665
- index_length: 索引长度
666
- icm_update: 增量更正
667
- set_typ: {}
574
+ db_name: 数据库名
575
+ table_name: 表名
576
+ move_insert: 根据df 的日期,先移除数据库数据,再插入, df_sql, icm_update 都要设置为 False
577
+ 原则上只限于聚合数据使用,原始数据插入时不要设置
578
+ df_sql: 这是一个临时参数, 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重,初创表大量上传数据的时候使用
579
+ icm_update: 增量更新, 在聚合数据中使用,原始文件不要使用
580
+ 使用增量更新: 必须确保 icm_update 传进来的列必须是数据表中唯一主键,值不会发生变化,不会重复,否则可能产生错乱覆盖情况
581
+ filename: 用来追踪处理进度,传这个参数是方便定位产生错误的文件
668
582
  allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
669
583
  """
670
584
  if not self.config:
671
585
  return
586
+ if icm_update:
587
+ if move_insert or df_sql:
588
+ logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
589
+ return
590
+ if move_insert:
591
+ if icm_update or df_sql:
592
+ logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
593
+ return
672
594
 
673
- if not dict_data_list:
674
- logger.info(f'dict_data_list 不能为空 ')
595
+ self.filename = filename
596
+ if isinstance(df, pd.DataFrame):
597
+ if len(df) == 0:
598
+ logger.info(f'{db_name}: {table_name} 传入的 df 数据长度为0, {self.filename}')
599
+ return
600
+ else:
601
+ logger.info(f'{db_name}: {table_name} 传入的 df 不是有效的 dataframe 结构, {self.filename}')
675
602
  return
676
- dict_data = dict_data_list[0]
603
+ if not db_name or db_name == 'None':
604
+ logger.info(f'{db_name} 不能为 None')
605
+ return
606
+
677
607
  if cut_data:
678
- if '日期' in dict_data.keys():
608
+ if '日期' in df.columns.tolist():
679
609
  try:
680
- __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
681
- __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
610
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
611
+ min_year = df['日期'].min(skipna=True).year
612
+ min_month = df['日期'].min(skipna=True).month
613
+ if 0 < int(min_month) < 10 and not str(min_month).startswith('0'):
614
+ min_month = f'0{min_month}'
682
615
  if str(cut_data).lower() == 'year':
683
- table_name = f'{table_name}_{__y}'
616
+ table_name = f'{table_name}_{min_year}'
684
617
  elif str(cut_data).lower() == 'month':
685
- table_name = f'{table_name}_{__y_m}'
618
+ table_name = f'{table_name}_{min_year}-{min_month}'
686
619
  else:
687
620
  logger.info(f'参数不正确,cut_data应为 year 或 month ')
688
621
  except Exception as e:
689
622
  logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
623
+ # 清理 dataframe 非法值,并转换获取数据类型
624
+ dtypes, df = self.convert_df_dtypes(df)
625
+ if set_typ:
626
+ # 更新自定义的列数据类型
627
+ for k, v in dtypes.copy().items():
628
+ # 确保传进来的 set_typ 键存在于实际的 df 列才 update
629
+ [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
690
630
 
691
631
  connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
692
632
  if not connection:
693
633
  return
694
634
  with connection.cursor() as cursor:
695
- cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
635
+ cursor.execute("SHOW DATABASES LIKE %s", (db_name,)) # 检查数据库是否存在
696
636
  database_exists = cursor.fetchone()
697
637
  if not database_exists:
698
638
  # 如果数据库不存在,则新建
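A minimal sketch of calling the reworked df_to_mysql shown in the hunk above, assuming the 3.9.2 signature from this diff; the DataFrame contents and connection details are illustrative placeholders.

    import pandas as pd
    from mdbq.mysql.mysql import MysqlUpload

    df = pd.DataFrame({'日期': ['2024-08-27'], '花费': [12.5]})
    uploader = MysqlUpload(username='user', password='pass', host='127.0.0.1', port=3306)
    # df_sql=True bulk-loads via DataFrame.to_sql without deduplication;
    # per the docstring above, icm_update and move_insert must not be set at the same time.
    uploader.df_to_mysql(df=df, db_name='test_db', table_name='demo_table', df_sql=True)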
@@ -710,917 +650,1582 @@ class MysqlUpload:
710
650
  sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
711
651
  cursor.execute(sql, (table_name,))
712
652
  if not cursor.fetchone():
713
- sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
714
- cursor.execute(sql)
653
+ create_table_sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY)"
654
+ cursor.execute(create_table_sql)
715
655
  logger.info(f'创建 mysql 表: {table_name}')
716
656
 
717
- # 根据 dict_data 的值添加指定的数据类型
718
- dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
719
- if set_typ:
720
- # 更新自定义的列数据类型
721
- for k, v in dtypes.copy().items():
722
- # 确保传进来的 set_typ 键存在于实际的 df 列才 update
723
- [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
724
-
725
- # 检查列
657
+ # 有特殊字符不需转义
726
658
  sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
727
659
  cursor.execute(sql, (db_name, table_name))
728
- col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
729
- col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
730
- # 不存在则新建列
660
+ col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
661
+ cols = df.columns.tolist()
662
+ col_not_exist = [col for col in cols if col not in col_exist]
663
+
664
+ # 检查列,不存在则新建列
731
665
  if col_not_exist: # 数据表中不存在的列
732
666
  for col in col_not_exist:
733
667
  # 创建列,需转义
734
- if allow_not_null:
735
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
736
- else:
737
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
738
-
739
- cursor.execute(sql)
668
+ alter_sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]}"
669
+ if not allow_not_null:
670
+ alter_sql += " NOT NULL"
671
+ cursor.execute(alter_sql)
740
672
  logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
741
673
 
674
+ # 创建索引
742
675
  if col == '日期':
743
- sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
744
- logger.info(f"设置为索引: {col}({dtypes[col]})")
745
- cursor.execute(sql)
746
-
676
+ sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
677
+ cursor.execute(sql, (col,))
678
+ result = cursor.fetchone() # 检查索引是否存在
679
+ if not result:
680
+ cursor.execute(f"CREATE INDEX index_name ON `{table_name}`(`{col}`)")
747
681
  connection.commit() # 提交事务
748
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
749
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
750
- # 处理插入的数据
751
- for dict_data in dict_data_list:
752
- dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
753
- if icm_update:
754
- """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
755
- sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
756
- cursor.execute(sql, (db_name, table_name))
757
- columns = cursor.fetchall()
758
- cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
759
- # 保留原始列名,不提前转义
760
- raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
761
682
 
762
- # 构建条件参数(使用原始列名)
763
- condition_params = []
764
- condition_parts = []
765
- for up_col in icm_update:
766
- condition_parts.append(f"`{up_col}` = %s") # SQL 转义
767
- condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
768
-
769
- # 动态转义列名生成 SQL 查询字段
770
- escaped_update_col = [f'`{col}`' for col in raw_update_col]
771
- sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
772
- cursor.execute(sql, condition_params)
773
- results = cursor.fetchall()
683
+ if df_sql:
684
+ logger.info(f'正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
685
+ engine = create_engine(
686
+ f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
687
+ df.to_sql(
688
+ name=table_name,
689
+ con=engine,
690
+ if_exists='append',
691
+ index=False,
692
+ chunksize=1000,
693
+ method='multi'
694
+ )
695
+ connection.commit() # 提交事务
696
+ connection.close()
697
+ return
774
698
 
775
- if results:
776
- for result in results:
777
- change_col = []
778
- change_placeholders = []
779
- set_params = []
780
- for raw_col in raw_update_col:
781
- # 使用原始列名访问数据
782
- df_value = str(dict_data[raw_col])
783
- mysql_value = str(result[raw_col])
699
+ # 5. 移除指定日期范围内的数据,原则上只限于聚合数据使用,原始数据插入时不要设置
700
+ if move_insert and '日期' in df.columns.tolist():
701
+ # 移除数据
702
+ dates = df['日期'].values.tolist()
703
+ dates = [pd.to_datetime(item) for item in dates] # 需要先转换类型才能用 min, max
704
+ start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
705
+ end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
784
706
 
785
- # 清理小数点后多余的零
786
- if '.' in df_value:
787
- df_value = re.sub(r'0+$', '', df_value).rstrip('.')
788
- if '.' in mysql_value:
789
- mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
707
+ delete_sql = f"""
708
+ DELETE FROM `{table_name}`
709
+ WHERE 日期 BETWEEN %s AND %s
710
+ """
711
+ cursor.execute(delete_sql, (start_date, end_date))
712
+ connection.commit()
790
713
 
791
- if df_value != mysql_value:
792
- change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
793
- set_params.append(dict_data[raw_col])
794
- change_col.append(raw_col)
714
+ # 插入数据
715
+ engine = create_engine(
716
+ f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
717
+ df.to_sql(
718
+ name=table_name,
719
+ con=engine,
720
+ if_exists='append',
721
+ index=False,
722
+ chunksize=1000,
723
+ method='multi'
724
+ )
725
+ return
795
726
 
796
- if change_placeholders:
797
- full_params = set_params + condition_params
798
- sql = f"""UPDATE `{table_name}`
799
- SET {','.join(change_placeholders)}
800
- WHERE {' AND '.join(condition_parts)}"""
801
- cursor.execute(sql, full_params)
802
- else: # 没有数据返回,则直接插入数据
803
- # 参数化插入
804
- cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
805
- placeholders = ', '.join(['%s'] * len(dict_data))
806
- sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})"
807
- cursor.execute(sql, tuple(dict_data.values()))
808
- connection.commit() # 提交数据库
809
- continue
727
+ datas = df.to_dict(orient='records')
728
+ for data in datas:
729
+ # data 是传进来待处理的数据, 不是数据库数据
730
+ # data 示例: {'日期': Timestamp('2024-08-27 00:00:00'), '推广费余额': 33299, '品销宝余额': 2930.73, '短信剩余': 67471}
731
+ try:
732
+ # 预处理数据:转换非字符串类型
733
+ processed_data = {}
734
+ for k, v in data.items():
735
+ if isinstance(v, (int, float)):
736
+ processed_data[k] = float(v)
737
+ elif isinstance(v, pd.Timestamp):
738
+ processed_data[k] = v.strftime('%Y-%m-%d')
739
+ else:
740
+ processed_data[k] = str(v)
810
741
 
811
- # 标准插入逻辑(参数化修改)
812
- # 构造更新列(排除主键)
813
- update_cols = [k for k in dict_data.keys()]
814
- # 构建SQL
815
- cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
816
- placeholders = ', '.join(['%s'] * len(dict_data))
817
- update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
742
+ # 构建基础SQL要素
743
+ columns = [f'`{k}`' for k in processed_data.keys()]
744
+ placeholders = ', '.join(['%s'] * len(processed_data))
745
+ values = list(processed_data.values())
818
746
 
819
- sql = f"""INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
820
- # 执行参数化查询
821
- try:
822
- cursor.execute(sql, tuple(dict_data.values()))
823
- connection.commit()
824
- except pymysql.Error as e:
825
- logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
826
- connection.rollback()
827
- connection.close()
747
+ # 构建基本INSERT语句
748
+ insert_sql = f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({placeholders})"
828
749
 
829
- # @try_except
830
- def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
831
- """
832
- 插入字典数据
833
- dict_data: 字典
834
- index_length: 索引长度
835
- icm_update: 增量更新
836
- set_typ: {}
837
- allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
838
- """
839
- if not self.config:
840
- return
750
+ if icm_update: # 增量更新, 专门用于聚合数据,其他库不要调用
751
+ # 获取数据表结构
752
+ cursor.execute(
753
+ "SELECT COLUMN_NAME FROM information_schema.columns "
754
+ "WHERE table_schema = %s AND table_name = %s",
755
+ (db_name, table_name)
756
+ )
757
+ cols_exist = [row['COLUMN_NAME'] for row in cursor.fetchall()]
758
+ update_columns = [col for col in cols_exist if col not in icm_update and col != 'id']
841
759
 
842
- if cut_data:
843
- if '日期' in dict_data.keys():
844
- try:
845
- __y = pd.to_datetime(dict_data['日期']).strftime('%Y')
846
- __y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
847
- if str(cut_data).lower() == 'year':
848
- table_name = f'{table_name}_{__y}'
849
- elif str(cut_data).lower() == 'month':
850
- table_name = f'{table_name}_{__y_m}'
851
- else:
852
- logger.info(f'参数不正确,cut_data应为 year 或 month ')
853
- except Exception as e:
854
- logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
760
+ # 构建WHERE条件
761
+ where_conditions = []
762
+ where_values = []
763
+ for col in icm_update:
764
+ where_conditions.append(f"`{col}` = %s")
765
+ where_values.append(processed_data[col])
855
766
 
856
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
857
- if not connection:
858
- return
859
- with connection.cursor() as cursor:
860
- cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
861
- database_exists = cursor.fetchone()
862
- if not database_exists:
863
- # 如果数据库不存在,则新建
864
- sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
865
- cursor.execute(sql)
866
- connection.commit()
867
- logger.info(f"创建Database: {db_name}")
767
+ # 查询现有数据
768
+ select_sql = f"SELECT {', '.join([f'`{col}`' for col in update_columns])} " \
769
+ f"FROM `{table_name}` WHERE {' AND '.join(where_conditions)}"
770
+ cursor.execute(select_sql, where_values)
771
+ existing_data = cursor.fetchone()
868
772
 
869
- self.config.update({'database': db_name}) # 添加更新 config 字段
870
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
871
- if not connection:
872
- return
873
- with connection.cursor() as cursor:
874
- # 1. 查询表, 不存在则创建一个空表
875
- sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
876
- cursor.execute(sql, (table_name,))
877
- if not cursor.fetchone():
878
- sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
879
- cursor.execute(sql)
880
- logger.info(f'创建 mysql 表: {table_name}')
773
+ if existing_data:
774
+ # 比较并构建更新语句
775
+ update_set = []
776
+ update_values = []
777
+ for col in update_columns:
778
+ db_value = existing_data[col]
779
+ new_value = processed_data[col]
881
780
 
882
- # 根据 dict_data 的值添加指定的数据类型
883
- dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
884
- if set_typ:
885
- # 更新自定义的列数据类型
886
- for k, v in dtypes.copy().items():
887
- # 确保传进来的 set_typ 键存在于实际的 df 列才 update
888
- [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
781
+ # 处理数值类型的精度差异
782
+ if isinstance(db_value, float) and isinstance(new_value, float):
783
+ if not math.isclose(db_value, new_value, rel_tol=1e-9):
784
+ update_set.append(f"`{col}` = %s")
785
+ update_values.append(new_value)
786
+ elif db_value != new_value:
787
+ update_set.append(f"`{col}` = %s")
788
+ update_values.append(new_value)
889
789
 
890
- # 检查列
891
- sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
892
- cursor.execute(sql, (db_name, table_name))
893
- col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
894
- col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
895
- # 不存在则新建列
896
- if col_not_exist: # 数据表中不存在的列
897
- for col in col_not_exist:
898
- # 创建列,需转义
899
- if allow_not_null:
900
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
790
+ if update_set:
791
+ update_sql = f"UPDATE `{table_name}` SET {', '.join(update_set)} " \
792
+ f"WHERE {' AND '.join(where_conditions)}"
793
+ cursor.execute(update_sql, update_values + where_values)
794
+ else:
795
+ cursor.execute(insert_sql, values)
901
796
  else:
902
- sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
903
- cursor.execute(sql)
904
- logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
905
-
906
- if col == '日期':
907
- sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
908
- logger.info(f"设置为索引: {col}({dtypes[col]})")
909
- cursor.execute(sql)
910
- connection.commit() # 提交事务
911
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
912
- """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
913
- # 处理插入的数据
914
- if icm_update:
915
- """ 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
916
- sql = """SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s"""
917
- cursor.execute(sql, (db_name, table_name))
918
- cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()] # 数据表的所有列, 返回 list
919
-
920
- # 保留原始列名,不提前转义
921
- raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id']
922
-
923
- # 构建条件参数(使用原始列名)
924
- condition_params = []
925
- condition_parts = []
926
- for up_col in icm_update:
927
- condition_parts.append(f"`{up_col}` = %s") # SQL 转义
928
- condition_params.append(dict_data[up_col]) # 原始列名访问数据
797
+ # 普通插入
798
+ cursor.execute(insert_sql, values)
799
+ except Exception as e:
800
+ pass
801
+ connection.commit() # 提交事务
802
+ connection.close()
929
803
 
930
- # 动态转义列名生成 SQL 查询字段
931
- escaped_update_col = [f'`{col}`' for col in raw_update_col]
932
- sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
933
- cursor.execute(sql, condition_params)
934
- results = cursor.fetchall()
935
804
 
936
- if results:
937
- for result in results:
938
- change_col = []
939
- change_placeholders = []
940
- set_params = []
941
- for raw_col in raw_update_col:
942
- # 使用原始列名访问数据
943
- df_value = str(dict_data[raw_col])
944
- mysql_value = str(result[raw_col])
805
+ class OptimizeDatas:
806
+ """
807
+ 数据维护 删除 mysql 的冗余数据
808
+ 更新过程:
809
+ 1. 读取所有数据表
810
+ 2. 遍历表, 遍历列, 如果存在日期列则按天遍历所有日期, 不存在则全表读取
811
+ 3. 按天删除所有冗余数据(存在日期列时)
812
+ tips: 查找冗余数据的方式是创建一个临时迭代器, 逐行读取数据并添加到迭代器, 出现重复时将重复数据的 id 添加到临时列表, 按列表 id 执行删除
813
+ """
814
+ def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
815
+ self.username = username
816
+ self.password = password
817
+ self.host = host
818
+ self.port = port # 默认端口, 此后可能更新,不作为必传参数
819
+ self.charset = charset
820
+ self.config = {
821
+ 'host': self.host,
822
+ 'port': int(self.port),
823
+ 'user': self.username,
824
+ 'password': self.password,
825
+ 'charset': self.charset, # utf8mb4 支持存储四字节的UTF-8字符集
826
+ 'cursorclass': pymysql.cursors.DictCursor,
827
+ }
828
+ self.db_name_lists: list = [] # 更新多个数据库 删除重复数据
829
+ self.db_name = None
830
+ self.days: int = 63 # 对近 N 天的数据进行排重
831
+ self.end_date = None
832
+ self.start_date = None
833
+ self.connection = None
945
834
 
946
- # 清理小数点后多余的零
947
- if '.' in df_value:
948
- df_value = re.sub(r'0+$', '', df_value).rstrip('.')
949
- if '.' in mysql_value:
950
- mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
835
+ @staticmethod
836
+ def try_except(func): # 在类内部定义一个异常处理方法
951
837
 
952
- if df_value != mysql_value:
953
- change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
954
- set_params.append(dict_data[raw_col])
955
- change_col.append(raw_col)
838
+ @wraps(func)
839
+ def wrapper(*args, **kwargs):
840
+ try:
841
+ return func(*args, **kwargs)
842
+ except Exception as e:
843
+ logger.error(f'{func.__name__}, {e}') # 将异常信息返回
956
844
 
957
- if change_placeholders:
958
- full_params = set_params + condition_params
959
- sql = f"""UPDATE `{table_name}`
960
- SET {','.join(change_placeholders)}
961
- WHERE {' AND '.join(condition_parts)}"""
962
- cursor.execute(sql, full_params)
963
- else: # 没有数据返回,则直接插入数据
964
- # 参数化插入语句
965
- keys = [f"`{k}`" for k in dict_data.keys()]
966
- placeholders = ','.join(['%s'] * len(dict_data))
967
- update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
968
- sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
969
- cursor.execute(sql, tuple(dict_data.values()))
970
- connection.commit() # 提交数据库
971
- connection.close()
972
- return
845
+ return wrapper
973
846
 
974
- # 常规插入处理(参数化)
975
- keys = [f"`{k}`" for k in dict_data.keys()]
976
- placeholders = ','.join(['%s'] * len(dict_data))
977
- update_clause = ','.join([f"`{k}`=VALUES(`{k}`)" for k in dict_data.keys()])
978
- sql = f"""INSERT INTO `{table_name}` ({','.join(keys)}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
979
- cursor.execute(sql, tuple(dict_data.values()))
980
- connection.commit()
981
- connection.close()
847
+ def keep_connect(self, _db_name, _config, max_try: int=10):
848
+ attempts = 1
849
+ while attempts <= max_try:
850
+ try:
851
+ connection = pymysql.connect(**_config) # 连接数据库
852
+ return connection
853
+ except Exception as e:
854
+ logger.error(f'{_db_name}连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
855
+ attempts += 1
856
+ time.sleep(30)
857
+ logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
858
+ return None
982
859
 
983
- def cover_dict_dtypes(self, dict_data):
984
- """ 清理字典键值 并转换数据类型 """
985
- if not dict_data:
986
- logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
860
+ def optimize_list(self):
861
+ """
862
+ 更新多个数据库 移除冗余数据
863
+ 需要设置 self.db_name_lists
864
+ """
865
+ if not self.db_name_lists:
866
+ logger.info(f'尚未设置参数: self.db_name_lists')
987
867
  return
988
- __res_dict = {}
989
- new_dict_data = {}
990
- for k, v in dict_data.items():
991
- k = str(k).lower()
992
- k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
993
- k = k.replace(')', '')
994
- k = re.sub(r'_{2,}', '_', k)
995
- k = re.sub(r'_+$', '', k)
996
- if str(v) == '':
997
- v = 0
998
- v = str(v)
999
- v = re.sub('^="|"$', '', v, re.I)
1000
- v = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(v)) # 移除控制字符
1001
- if re.findall(r'^[-+]?\d+\.?\d*%$', v):
1002
- v = str(float(v.rstrip("%")) / 100)
868
+ for db_name in self.db_name_lists:
869
+ self.db_name = db_name
870
+ self.optimize()
1003
871
 
1004
- result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
1005
- result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
1006
- result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
1007
- result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
872
+ def optimize(self, except_key=['更新时间']):
873
+ """ 更新一个数据库 移除冗余数据 """
874
+ if not self.db_name:
875
+ logger.info(f'尚未设置参数: self.db_name')
876
+ return
877
+ tables = self.table_list(db_name=self.db_name)
878
+ if not tables:
879
+ logger.info(f'{self.db_name} -> 数据表不存在')
880
+ return
1008
881
 
1009
- date_type = otk.is_valid_date(v) # 判断日期时间
1010
- int_num = otk.is_integer(v) # 判断整数
1011
- count_int, count_float = count_decimal_places(v) # 判断小数,返回小数位数
1012
- if result1: # 京东sku/spu商品信息
1013
- __res_dict.update({k: 'varchar(100)'})
1014
- elif k == '日期':
1015
- __res_dict.update({k: 'DATE'})
1016
- elif k == '更新时间':
1017
- __res_dict.update({k: 'TIMESTAMP'})
1018
- elif result2: # 小数
1019
- __res_dict.update({k: 'decimal(10,4)'})
1020
- elif date_type == 1: # 纯日期
1021
- __res_dict.update({k: 'DATE'})
1022
- elif date_type == 2: # 日期+时间
1023
- __res_dict.update({k: 'DATETIME'})
1024
- elif int_num:
1025
- __res_dict.update({k: 'INT'})
1026
- elif count_float > 0:
1027
- if count_int + count_float > 10:
1028
- # if count_float > 5:
1029
- # v = round(float(v), 4)
1030
- if count_float >= 6:
1031
- __res_dict.update({k: 'decimal(14,6)'})
1032
- else:
1033
- __res_dict.update({k: 'decimal(14,4)'})
1034
- elif count_float >= 6:
1035
- __res_dict.update({k: 'decimal(14,6)'})
1036
- elif count_float >= 4:
1037
- __res_dict.update({k: 'decimal(12,4)'})
1038
- else:
1039
- __res_dict.update({k: 'decimal(10,2)'})
1040
- else:
1041
- __res_dict.update({k: 'varchar(255)'})
1042
- new_dict_data.update({k: v})
1043
- return __res_dict, new_dict_data
882
+ # 日期初始化
883
+ if not self.end_date:
884
+ self.end_date = pd.to_datetime(datetime.datetime.today())
885
+ else:
886
+ self.end_date = pd.to_datetime(self.end_date)
887
+ if self.days:
888
+ self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
889
+ if not self.start_date:
890
+ self.start_date = self.end_date
891
+ else:
892
+ self.start_date = pd.to_datetime(self.start_date)
893
+ start_date_before = self.start_date
894
+ end_date_before = self.end_date
1044
895
 
1045
- def convert_df_dtypes(self, df: pd.DataFrame):
1046
- """ 清理 df 的值和列名,并转换数据类型 """
1047
- df = otk.cover_df(df=df) # 清理 df 的值和列名
1048
- [pd.to_numeric(df[col], errors='ignore') for col in df.columns.tolist()]
1049
- dtypes = df.dtypes.to_dict()
1050
- __res_dict = {}
1051
- for k, v in dtypes.copy().items():
1052
- result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
1053
- result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
1054
- result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
1055
- result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
896
+ logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
897
+ for table_dict in tables:
898
+ for key, table_name in table_dict.items():
899
+ self.config.update({'database': self.db_name}) # 添加更新 config 字段
900
+ self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
901
+ if not self.connection:
902
+ return
903
+ with self.connection.cursor() as cursor:
904
+ sql = f"SELECT 1 FROM `{table_name}` LIMIT 1"
905
+ cursor.execute(sql)
906
+ result = cursor.fetchone()
907
+ if not result:
908
+ logger.info(f'数据表: {table_name}, 数据长度为 0')
909
+ continue # 检查数据表是否为空
1056
910
 
1057
- if result1: # id/sku/spu商品信息
1058
- __res_dict.update({k: 'varchar(50)'})
1059
- elif result2: # 小数
1060
- __res_dict.update({k: 'decimal(10,4)'})
1061
- elif result3: # 小数
1062
- __res_dict.update({k: 'decimal(12,4)'})
1063
- elif result4: # 小数
1064
- __res_dict.update({k: 'decimal(12,2)'})
1065
- elif k == '日期':
1066
- __res_dict.update({k: 'date'})
1067
- elif k == '更新时间':
1068
- __res_dict.update({k: 'timestamp'})
1069
- elif v == 'int64':
1070
- __res_dict.update({k: 'int'})
1071
- elif v == 'float64':
1072
- __res_dict.update({k: 'decimal(10,4)'})
1073
- elif v == 'bool':
1074
- __res_dict.update({k: 'boolean'})
1075
- elif v == 'datetime64[ns]':
1076
- __res_dict.update({k: 'datetime'})
1077
- else:
1078
- __res_dict.update({k: 'varchar(255)'})
1079
- return __res_dict, df
911
+ cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
912
+ columns = cursor.fetchall()
913
+ date_exist = False
914
+ for col in columns: # 遍历列信息,检查是否存在类型为日期的列
915
+ if col['Field'] == '日期' and (col['Type'] == 'date' or col['Type'].startswith('datetime')):
916
+ date_exist = True
917
+ break
918
+ if date_exist: # 存在日期列
919
+ sql_max = f"SELECT MAX(日期) AS max_date FROM `{table_name}`"
920
+ sql_min = f"SELECT MIN(日期) AS min_date FROM `{table_name}`"
921
+ cursor.execute(sql_max)
922
+ max_result = cursor.fetchone()
923
+ cursor.execute(sql_min)
924
+ min_result = cursor.fetchone()
925
+ # 匹配修改为合适的起始和结束日期
926
+ if self.start_date < pd.to_datetime(min_result['min_date']):
927
+ self.start_date = pd.to_datetime(min_result['min_date'])
928
+ if self.end_date > pd.to_datetime(max_result['max_date']):
929
+ self.end_date = pd.to_datetime(max_result['max_date'])
930
+ dates_list = self.day_list(start_date=self.start_date, end_date=self.end_date)
931
+ # dates_list 是日期列表
932
+ for date in dates_list:
933
+ self.delete_duplicate(table_name=table_name, date=date, except_key=except_key)
934
+ self.start_date = start_date_before # 重置,不然日期错乱
935
+ self.end_date = end_date_before
936
+ else: # 不存在日期列的情况
937
+ self.delete_duplicate2(table_name=table_name, except_key=except_key)
938
+ self.connection.close()
939
+ logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
1080
940
 
1081
- @try_except
1082
- def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False,
1083
- filename=None, count=None, allow_not_null=False, cut_data=None):
941
+ def delete_duplicate(self, table_name, date, except_key=['更新时间']):
942
+ datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
943
+ if not datas:
944
+ return
945
+ duplicate_id = [] # 出现重复的 id
946
+ all_datas = [] # 迭代器
947
+ for data in datas:
948
+ for e_key in except_key:
949
+ if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
950
+ del data[e_key]
951
+ try:
952
+ delete_id = data['id']
953
+ del data['id']
954
+ data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
955
+ if data in all_datas: # 数据出现重复时
956
+ if delete_id:
957
+ duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
958
+ continue
959
+ all_datas.append(data) # 数据没有重复
960
+ except Exception as e:
961
+ logger.debug(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> {e}')
962
+ del all_datas
963
+
964
+ if not duplicate_id: # 如果没有重复数据,则跳过该数据表
965
+ return
966
+
967
+ try:
968
+ with self.connection.cursor() as cursor:
969
+ placeholders = ', '.join(['%s'] * len(duplicate_id))
970
+ # 移除冗余数据
971
+ sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
972
+ cursor.execute(sql, duplicate_id)
973
+ logger.debug(f"{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
974
+ self.connection.commit() # 提交事务
975
+ except Exception as e:
976
+ logger.error(f'{self.db_name}/{table_name}, {e}')
977
+ self.connection.rollback() # 异常则回滚
978
+
979
+ def delete_duplicate2(self, table_name, except_key=['更新时间']):
980
+ with self.connection.cursor() as cursor:
981
+ sql = f"SELECT * FROM `{table_name}`" # 如果不包含日期列,则获取全部数据
982
+ cursor.execute(sql)
983
+ datas = cursor.fetchall()
984
+ if not datas:
985
+ return
986
+ duplicate_id = [] # 出现重复的 id
987
+ all_datas = [] # 迭代器
988
+ for data in datas:
989
+ for e_key in except_key:
990
+ if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
991
+ del data[e_key]
992
+ delete_id = data['id']
993
+ del data['id']
994
+ data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
995
+ if data in all_datas: # 数据出现重复时
996
+ duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
997
+ continue
998
+ all_datas.append(data) # 数据没有重复
999
+ del all_datas
1000
+
1001
+ if not duplicate_id: # 如果没有重复数据,则跳过该数据表
1002
+ return
1003
+
1004
+ try:
1005
+ with self.connection.cursor() as cursor:
1006
+ placeholders = ', '.join(['%s'] * len(duplicate_id))
1007
+ # 移除冗余数据
1008
+ sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
1009
+ cursor.execute(sql, duplicate_id)
1010
+ logger.info(f"{table_name} -> before: {len(datas)}, "
1011
+ f"remove: {cursor.rowcount}")
1012
+ self.connection.commit() # 提交事务
1013
+ except Exception as e:
1014
+ logger.error(f'{self.db_name}/{table_name}, {e}')
1015
+ self.connection.rollback() # 异常则回滚
1016
+
1017
+ def database_list(self):
1018
+ """ 获取所有数据库 """
1019
+ connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1020
+ if not connection:
1021
+ return
1022
+ with connection.cursor() as cursor:
1023
+ cursor.execute("SHOW DATABASES")
1024
+ databases = cursor.fetchall() # 获取所有数据库的结果
1025
+ connection.close()
1026
+ return databases
1027
+
1028
+ def table_list(self, db_name):
1029
+ """ 获取指定数据库的所有数据表 """
1030
+ connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1031
+ if not connection:
1032
+ return
1033
+ try:
1034
+ with connection.cursor() as cursor:
1035
+ cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
1036
+ database_exists = cursor.fetchone()
1037
+ if not database_exists:
1038
+ logger.info(f'{db_name}: 数据表不存在!')
1039
+ return
1040
+ except Exception as e:
1041
+ logger.error(f'002 {e}')
1042
+ return
1043
+ finally:
1044
+ connection.close() # 断开连接
1045
+
1046
+ self.config.update({'database': db_name}) # 添加更新 config 字段
1047
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1048
+ if not connection:
1049
+ return
1050
+ with connection.cursor() as cursor:
1051
+ cursor.execute("SHOW TABLES")
1052
+ tables = cursor.fetchall() # 获取所有数据表
1053
+ connection.close()
1054
+ return tables
1055
+
1056
+ def table_datas(self, db_name, table_name, date):
1084
1057
  """
1085
- db_name: 数据库名
1086
- table_name: 表名
1087
- move_insert: 根据df 的日期,先移除数据库数据,再插入, df_sql, icm_update 都要设置为 False
1088
- 原则上只限于聚合数据使用,原始数据插入时不要设置
1089
- df_sql: 这是一个临时参数, 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重,初创表大量上传数据的时候使用
1090
- icm_update: 增量更新, 在聚合数据中使用,原始文件不要使用
1091
- 使用增量更新: 必须确保 icm_update 传进来的列必须是数据表中唯一主键,值不会发生变化,不会重复,否则可能产生错乱覆盖情况
1092
- filename: 用来追踪处理进度,传这个参数是方便定位产生错误的文件
1093
- allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
1058
+ 获取指定数据表的数据, 按天获取
1094
1059
  """
1095
- if not self.config:
1060
+ self.config.update({'database': db_name}) # 添加更新 config 字段
1061
+ connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1062
+ if not connection:
1096
1063
  return
1097
- if icm_update:
1098
- if move_insert or df_sql:
1099
- logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
1100
- return
1101
- if move_insert:
1102
- if icm_update or df_sql:
1103
- logger.info(f'icm_update/move_insert/df_sql 参数不能同时设定')
1104
- return
1064
+ try:
1065
+ with connection.cursor() as cursor:
1066
+ sql = f"SELECT * FROM `{table_name}` WHERE {'日期'} BETWEEN '%s' AND '%s'" % (date, date)
1067
+ cursor.execute(sql)
1068
+ results = cursor.fetchall()
1069
+ except Exception as e:
1070
+ logger.error(f'001 {e}')
1071
+ finally:
1072
+ connection.close()
1073
+ return results
1074
+
1075
+ def day_list(self, start_date, end_date):
1076
+ start_date = pd.to_datetime(start_date)
1077
+ end_date = pd.to_datetime(end_date)
1078
+ date_list = []
1079
+ while start_date <= end_date:
1080
+ date_list.append(pd.to_datetime(start_date.date()))
1081
+ start_date += datetime.timedelta(days=1)
1082
+ return date_list
1083
+
1084
+ def rename_column(self):
1085
+ """ 批量修改数据库的列名 """
1086
+ """
1087
+ # for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2',]:
1088
+ # s = OptimizeDatas(username=username, password=password, host=host, port=port)
1089
+ # s.db_name = db_name
1090
+ # s.rename_column()
1091
+ """
1092
+ tables = self.table_list(db_name=self.db_name)
1093
+ for table_dict in tables:
1094
+ for key, table_name in table_dict.items():
1095
+ self.config.update({'database': self.db_name}) # 添加更新 config 字段
1096
+ self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1097
+ if not self.connection:
1098
+ return
1099
+ with self.connection.cursor() as cursor:
1100
+ cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
1101
+ columns = cursor.fetchall()
1102
+ columns = [{column['Field']: column['Type']} for column in columns]
1103
+ for column in columns:
1104
+ for key, value in column.items():
1105
+ if key.endswith('_'):
1106
+ new_name = re.sub(r'_+$', '', key)
1107
+ sql = f"ALTER TABLE `{table_name}` CHANGE COLUMN {key} {new_name} {value}"
1108
+ cursor.execute(sql)
1109
+ self.connection.commit()
1110
+ if self.connection:
1111
+ self.connection.close()
1112
+
1113
+
1114
+ class MySQLUploader:
1115
+ def __init__(
1116
+ self,
1117
+ username: str,
1118
+ password: str,
1119
+ host: str = 'localhost',
1120
+ port: int = 3306,
1121
+ charset: str = 'utf8mb4',
1122
+ collation: str = 'utf8mb4_0900_ai_ci',
1123
+ logging_mode: str = 'console', # 'both'(控制台+文件), 'console'(仅控制台), 'file'(仅文件), 'none'(禁用)
1124
+ log_level: str = 'INFO', # 默认日志级别
1125
+ log_file: str = 'mysql_upload.log', # 日志文件路径
1126
+ max_log_size: int = 50, # 日志文件大小(MB)
1127
+ backup_count: int = 5, # 保留的日志文件数量
1128
+ max_retries: int = 10,
1129
+ retry_interval: int = 10,
1130
+ pool_size: int = 10,
1131
+ connect_timeout: int = 10,
1132
+ read_timeout: int = 30,
1133
+ write_timeout: int = 30,
1134
+ ssl: Optional[Dict] = None,
1135
+ enable_metrics: bool = True # 是否启用性能指标收集
1136
+ ):
1137
+ """
1138
+ :param username: 数据库用户名
1139
+ :param password: 数据库密码
1140
+ :param host: 数据库主机地址,默认为localhost
1141
+ :param port: 数据库端口,默认为3306
1142
+ :param charset: 字符集,默认为utf8mb4
1143
+ :param collation: 排序规则,默认为utf8mb4_0900_ai_ci
1144
+ :param logging_mode: 日志模式,可选 'both'(控制台+文件), 'console'(仅控制台), 'file'(仅文件), 'none'(禁用)
1145
+ :param log_level: 日志级别,默认为INFO
1146
+ :param log_file: 日志文件路径
1147
+ :param max_log_size: 日志文件最大大小(MB),默认为50
1148
+ :param backup_count: 保留的日志备份数量,默认为5
1149
+ :param max_retries: 最大重试次数,默认为10
1150
+ :param retry_interval: 重试间隔(秒),默认为10
1151
+ :param pool_size: 连接池大小,默认为10
1152
+ :param connect_timeout: 连接超时(秒),默认为10
1153
+ :param read_timeout: 读取超时(秒),默认为30
1154
+ :param write_timeout: 写入超时(秒),默认为30
1155
+ :param ssl: SSL配置字典,默认为None
1156
+ :param enable_metrics: 是否启用性能指标收集,默认为True
1157
+ """
1158
+ self.username = username
1159
+ self.password = password
1160
+ self.host = host
1161
+ self.port = port
1162
+ self.charset = charset
1163
+ self.collation = collation
1164
+ self.max_retries = max(max_retries, 1)
1165
+ self.retry_interval = max(retry_interval, 1)
1166
+ self.pool_size = max(pool_size, 1)
1167
+ self.connect_timeout = connect_timeout
1168
+ self.read_timeout = read_timeout
1169
+ self.write_timeout = write_timeout
1170
+ self.ssl = ssl
1171
+ self._prepared_statements = {}
1172
+ self._max_cached_statements = 100
1173
+ self.enable_metrics = enable_metrics
1174
+ self.metrics = {
1175
+ 'total_uploads': 0,
1176
+ 'successful_uploads': 0,
1177
+ 'failed_uploads': 0,
1178
+ 'total_rows': 0,
1179
+ 'successful_rows': 0,
1180
+ 'failed_rows': 0,
1181
+ 'total_retries': 0,
1182
+ 'total_execution_time': 0.0,
1183
+ 'connection_usage': [],
1184
+ 'memory_usage': [],
1185
+ 'cpu_usage': []
1186
+ }
1187
+
1188
+ # 初始化日志系统
1189
+ self._init_logging(logging_mode, log_level, log_file, max_log_size, backup_count)
1190
+
1191
+ # 创建连接池
1192
+ self.pool = self._create_connection_pool()
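A hedged usage sketch of the new constructor, using only parameters that appear in the signature above; the credentials and log file name are placeholders, and note that the pool is built immediately, so instantiation will attempt to reach the server:

```python
# Sketch only - placeholder credentials, all other values are the documented defaults.
uploader = MySQLUploader(
    username='example_user',
    password='example_password',
    host='localhost',
    port=3306,
    logging_mode='both',           # console + rotating log file
    log_level='INFO',
    log_file='mysql_upload.log',
    pool_size=10,
    ssl=None,                      # or {'ca': ..., 'cert': ..., 'key': ...}
)
```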
1193
+
1194
+ def _init_logging(
1195
+ self,
1196
+ logging_mode: str,
1197
+ log_level: str,
1198
+ log_file: str,
1199
+ max_log_size: int,
1200
+ backup_count: int
1201
+ ):
1202
+ """初始化结构化日志配置"""
1203
+ if logging_mode.lower() == 'none':
1204
+ self.logger = None
1205
+ return
1206
+
1207
+ valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
1208
+ level = log_level.upper() if log_level.upper() in valid_levels else 'INFO'
1209
+
1210
+ # 创建格式化器 - 使用结构化JSON格式
1211
+ class StructuredFormatter(logging.Formatter):
1212
+ def format(self, record):
1213
+ log_data = {
1214
+ 'time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
1215
+ 'level': record.levelname,
1216
+ 'message': record.getMessage(),
1217
+ # 'logger': record.name,
1218
+ 'module': record.module,
1219
+ 'line': record.lineno,
1220
+ # 'process': record.process
1221
+ }
1222
+
1223
+ # 添加异常信息
1224
+ if record.exc_info:
1225
+ log_data['exception'] = self.formatException(record.exc_info)
1226
+
1227
+ return json.dumps(log_data, ensure_ascii=False)
1228
+
1229
+ # 创建日志记录器
1230
+ self.logger = logging.getLogger('upload')
1231
+ self.logger.setLevel(level)
1232
+
1233
+ # 防止重复添加handler
1234
+ if self.logger.handlers:
1235
+ for handler in self.logger.handlers[:]:
1236
+ self.logger.removeHandler(handler)
1237
+
1238
+ formatter = StructuredFormatter()
1239
+ mode = logging_mode.lower()
1240
+
1241
+ # 根据模式添加相应的handler
1242
+ if mode in ('both', 'console'):
1243
+ console_handler = logging.StreamHandler()
1244
+ console_handler.setFormatter(formatter)
1245
+ self.logger.addHandler(console_handler)
1246
+
1247
+ if mode in ('both', 'file'):
1248
+ file_handler = logging.handlers.RotatingFileHandler(
1249
+ filename=log_file,
1250
+ maxBytes=max_log_size * 1024 * 1024,
1251
+ backupCount=backup_count,
1252
+ encoding='utf-8'
1253
+ )
1254
+ file_handler.setFormatter(formatter)
1255
+ self.logger.addHandler(file_handler)
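For reference, the StructuredFormatter above emits one JSON object per record; an illustrative (not captured) reproduction of the same field layout:

```python
import json
import datetime

# Mirrors the fields assembled in StructuredFormatter.format(); the values are made up.
sample = {
    'time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'level': 'INFO',
    'message': '连接池创建成功',
    'module': 'mysql',
    'line': 123,
}
print(json.dumps(sample, ensure_ascii=False))
# {"time": "2025-04-01 12:00:00", "level": "INFO", "message": "连接池创建成功", "module": "mysql", "line": 123}
```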
1105
1256
 
1106
- self.filename = filename
1107
- if isinstance(df, pd.DataFrame):
1108
- if len(df) == 0:
1109
- logger.info(f'{db_name}: {table_name} 传入的 df 数据长度为0, {self.filename}')
1110
- return
1111
- else:
1112
- logger.info(f'{db_name}: {table_name} 传入的 df 不是有效的 dataframe 结构, {self.filename}')
1257
+ def _record_metrics(self, metric_name: str, value: Any = 1, is_timing: bool = False):
1258
+ """记录性能指标"""
1259
+ if not self.enable_metrics:
1113
1260
  return
1114
- if not db_name or db_name == 'None':
1115
- logger.info(f'{db_name} 不能为 None')
1261
+
1262
+ if metric_name not in self.metrics:
1263
+ self.metrics[metric_name] = []
1264
+
1265
+ if is_timing:
1266
+ # 如果是时间指标,记录时间戳和值
1267
+ self.metrics[metric_name].append({
1268
+ 'timestamp': datetime.datetime.now().isoformat(),
1269
+ 'value': value
1270
+ })
1271
+ else:
1272
+ # 其他指标直接累加
1273
+ if isinstance(self.metrics[metric_name], (int, float)):
1274
+ self.metrics[metric_name] += value
1275
+ elif isinstance(self.metrics[metric_name], list):
1276
+ self.metrics[metric_name].append({
1277
+ 'timestamp': datetime.datetime.now().isoformat(),
1278
+ 'value': value
1279
+ })
1280
+
1281
+ def _get_system_metrics(self):
1282
+ """获取系统资源使用指标"""
1283
+ if not self.enable_metrics:
1284
+ return {}
1285
+
1286
+ metrics = {
1287
+ 'memory': psutil.virtual_memory().percent,
1288
+ 'cpu': psutil.cpu_percent(),
1289
+ }
1290
+
1291
+ # 更安全的连接数获取方式
1292
+ if hasattr(self, 'pool') and self.pool is not None:
1293
+ try:
1294
+ # 对于不同的连接池实现可能有不同的属性名
1295
+ if hasattr(self.pool, '_connections'):
1296
+ connections = self.pool._connections
1297
+ metrics['connections'] = len(connections) if hasattr(connections, '__len__') else 0
1298
+ else:
1299
+ metrics['connections'] = 0
1300
+ except Exception:
1301
+ metrics['connections'] = 0
1302
+ else:
1303
+ metrics['connections'] = 0
1304
+
1305
+ return metrics
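The memory and CPU figures above come straight from psutil's module-level helpers; a quick standalone check of the same two calls (numbers vary per machine, and `cpu_percent()` returns 0.0 on its very first non-blocking call):

```python
import psutil

print(psutil.virtual_memory().percent)  # e.g. 63.2  (percent of RAM in use)
print(psutil.cpu_percent())             # e.g. 8.5   (percent since the previous call)
```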
1306
+
1307
+ def _log_with_metrics(self, level: str, message: str, extra: Optional[Dict] = None):
1308
+ """日志记录"""
1309
+ if not self.logger:
1116
1310
  return
1117
1311
 
1118
- if cut_data:
1119
- if '日期' in df.columns.tolist():
1312
+ # 记录系统指标
1313
+ metrics = self._get_system_metrics()
1314
+ log_extra = {'metrics': metrics}
1315
+ if extra:
1316
+ log_extra.update(extra)
1317
+
1318
+ getattr(self.logger, level.lower())(message, extra={'extra_data': log_extra})
1319
+
1320
+ def _create_connection_pool(self) -> PooledDB:
1321
+ """创建数据库连接池"""
1322
+ start_time = time.time()
1323
+ self.pool = None
1324
+
1325
+ pool_params = {
1326
+ 'creator': pymysql,
1327
+ 'host': self.host,
1328
+ 'port': self.port,
1329
+ 'user': self.username,
1330
+ 'password': self.password,
1331
+ 'charset': self.charset,
1332
+ 'cursorclass': pymysql.cursors.DictCursor,
1333
+ 'maxconnections': self.pool_size,
1334
+ 'ping': 7,
1335
+ 'connect_timeout': self.connect_timeout,
1336
+ 'read_timeout': self.read_timeout,
1337
+ 'write_timeout': self.write_timeout,
1338
+ 'autocommit': False
1339
+ }
1340
+
1341
+ if self.ssl:
1342
+ required_keys = {'ca', 'cert', 'key'}
1343
+ if not all(k in self.ssl for k in required_keys):
1344
+ error_msg = "SSL配置必须包含ca、cert和key"
1345
+ self._log_with_metrics('error', error_msg)
1346
+ raise ValueError(error_msg)
1347
+ pool_params['ssl'] = {
1348
+ 'ca': self.ssl['ca'],
1349
+ 'cert': self.ssl['cert'],
1350
+ 'key': self.ssl['key'],
1351
+ 'check_hostname': self.ssl.get('check_hostname', False)
1352
+ }
1353
+
1354
+ try:
1355
+ pool = PooledDB(**pool_params)
1356
+ elapsed = time.time() - start_time
1357
+ self._record_metrics('connection_pool_creation_time', elapsed, is_timing=True)
1358
+ self._log_with_metrics('info', "连接池创建成功", {
1359
+ 'pool_size': self.pool_size,
1360
+ 'time_elapsed': elapsed
1361
+ })
1362
+ return pool
1363
+ except Exception as e:
1364
+ elapsed = time.time() - start_time
1365
+ self._record_metrics('connection_pool_failures', 1)
1366
+ self.pool = None
1367
+ self._log_with_metrics('error', "连接池创建失败", {
1368
+ 'error': str(e),
1369
+ 'time_elapsed': elapsed
1370
+ })
1371
+ raise ConnectionError(f"连接池创建失败: {str(e)}")
1372
+
1373
+ def _execute_with_retry(self, func):
1374
+ @wraps(func)
1375
+ def wrapper(*args, **kwargs):
1376
+ last_exception = None
1377
+ start_time = time.time()
1378
+ operation = func.__name__
1379
+
1380
+ self._log_with_metrics('debug', f"开始执行操作: {operation}", {
1381
+ 'attempt': 1,
1382
+ 'max_retries': self.max_retries
1383
+ })
1384
+
1385
+ for attempt in range(self.max_retries):
1120
1386
  try:
1121
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
1122
- min_year = df['日期'].min(skipna=True).year
1123
- min_month = df['日期'].min(skipna=True).month
1124
- if 0 < int(min_month) < 10 and not str(min_month).startswith('0'):
1125
- min_month = f'0{min_month}'
1126
- if str(cut_data).lower() == 'year':
1127
- table_name = f'{table_name}_{min_year}'
1128
- elif str(cut_data).lower() == 'month':
1129
- table_name = f'{table_name}_{min_year}-{min_month}'
1387
+ result = func(*args, **kwargs)
1388
+ elapsed = time.time() - start_time
1389
+
1390
+ if attempt > 0:
1391
+ self._record_metrics('total_retries', attempt)
1392
+ self._log_with_metrics('info', "操作成功(重试后)", {
1393
+ 'operation': operation,
1394
+ 'attempts': attempt + 1,
1395
+ 'time_elapsed': elapsed
1396
+ })
1130
1397
  else:
1131
- logger.info(f'参数不正确,cut_data应为 year 或 month ')
1398
+ self._log_with_metrics('debug', "操作成功", {
1399
+ 'operation': operation,
1400
+ 'time_elapsed': elapsed
1401
+ })
1402
+
1403
+ return result
1404
+
1405
+ except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
1406
+ last_exception = e
1407
+ self._record_metrics('database_errors', 1)
1408
+
1409
+ # 记录详细的MySQL错误信息
1410
+ error_details = {
1411
+ 'operation': operation,
1412
+ 'error_code': e.args[0] if e.args else None,
1413
+ 'error_message': e.args[1] if len(e.args) > 1 else None,
1414
+ 'attempt': attempt + 1,
1415
+ 'max_retries': self.max_retries
1416
+ }
1417
+
1418
+ if attempt < self.max_retries - 1:
1419
+ wait_time = self.retry_interval * (attempt + 1)
1420
+ error_details['wait_time'] = wait_time
1421
+ self._log_with_metrics('warning', "数据库操作失败,准备重试", error_details)
1422
+ time.sleep(wait_time)
1423
+
1424
+ # 尝试重新连接
1425
+ try:
1426
+ self.pool = self._create_connection_pool()
1427
+ self._log_with_metrics('info', "成功重新建立数据库连接")
1428
+ except Exception as reconnect_error:
1429
+ self._log_with_metrics('error', "重连失败", {
1430
+ 'error': str(reconnect_error)
1431
+ })
1432
+ else:
1433
+ elapsed = time.time() - start_time
1434
+ error_details['time_elapsed'] = elapsed
1435
+ self._log_with_metrics('error', "操作最终失败", error_details)
1436
+
1437
+ except pymysql.IntegrityError as e:
1438
+ elapsed = time.time() - start_time
1439
+ self._record_metrics('integrity_errors', 1)
1440
+ self._log_with_metrics('error', "完整性约束错误", {
1441
+ 'operation': operation,
1442
+ 'time_elapsed': elapsed,
1443
+ 'error_code': e.args[0] if e.args else None,
1444
+ 'error_message': e.args[1] if len(e.args) > 1 else None
1445
+ })
1446
+ raise e
1447
+
1132
1448
  except Exception as e:
1133
- logger.error(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
1134
- # 清理 dataframe 非法值,并转换获取数据类型
1135
- dtypes, df = self.convert_df_dtypes(df)
1136
- if set_typ:
1137
- # 更新自定义的列数据类型
1138
- for k, v in dtypes.copy().items():
1139
- # 确保传进来的 set_typ 键存在于实际的 df 列才 update
1140
- [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
1449
+ last_exception = e
1450
+ elapsed = time.time() - start_time
1451
+ self._record_metrics('unexpected_errors', 1)
1452
+ self._log_with_metrics('error', "发生意外错误", {
1453
+ 'operation': operation,
1454
+ 'time_elapsed': elapsed,
1455
+ 'error_type': type(e).__name__,
1456
+ 'error_message': str(e),
1457
+ 'error_args': e.args if hasattr(e, 'args') else None
1458
+ })
1459
+ break
1460
+
1461
+ raise last_exception if last_exception else Exception("发生未知错误")
1141
1462
 
1142
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1143
- if not connection:
1144
- return
1145
- with connection.cursor() as cursor:
1146
- cursor.execute("SHOW DATABASES LIKE %s", (db_name,)) # 检查数据库是否存在
1147
- database_exists = cursor.fetchone()
1148
- if not database_exists:
1149
- # 如果数据库不存在,则新建
1150
- sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
1151
- cursor.execute(sql)
1152
- connection.commit()
1153
- logger.info(f"创建Database: {db_name}")
1463
+ return wrapper
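Because `_execute_with_retry` takes the target callable as an argument and returns the wrapped version, it is applied at call time rather than with `@` syntax. A hedged sketch of that pattern; `_example_query` is a hypothetical helper, not part of the class:

```python
# Hypothetical helper, written as a MySQLUploader method, to show how the wrapper is applied.
def _example_query(self, sql, params=None):
    with self._get_connection() as conn:
        with conn.cursor() as cursor:
            cursor.execute(sql, params or ())
            return cursor.fetchall()

# Inside another MySQLUploader method:
#   rows = self._execute_with_retry(self._example_query)(sql, params)
# Transient pymysql.OperationalError/MySQLError calls are retried up to max_retries;
# IntegrityError is re-raised immediately.
```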
1154
1464
 
1155
- self.config.update({'database': db_name}) # 添加更新 config 字段
1156
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1157
- if not connection:
1158
- return
1159
- with connection.cursor() as cursor:
1160
- # 1. 查询表, 不存在则创建一个空表
1161
- sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
1162
- cursor.execute(sql, (table_name,))
1163
- if not cursor.fetchone():
1164
- create_table_sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY)"
1165
- cursor.execute(create_table_sql)
1166
- logger.info(f'创建 mysql 表: {table_name}')
1465
+ def _get_connection(self):
1466
+ """从连接池获取连接"""
1467
+ try:
1468
+ conn = self.pool.connection()
1469
+ self._log_with_metrics('debug', "获取数据库连接")
1470
+ return conn
1471
+ except Exception as e:
1472
+ self._log_with_metrics("error", str(e))
1473
+ raise ConnectionError(f"连接数据库失败: {str(e)}")
1167
1474
 
1168
- # 有特殊字符不需转义
1169
- sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
1170
- cursor.execute(sql, (db_name, table_name))
1171
- col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
1172
- cols = df.columns.tolist()
1173
- col_not_exist = [col for col in cols if col not in col_exist]
1475
+ def _check_database_exists(self, db_name: str) -> bool:
1476
+ """检查数据库是否存在"""
1477
+ db_name = self._validate_identifier(db_name)
1478
+ sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
1174
1479
 
1175
- # 检查列,不存在则新建列
1176
- if col_not_exist: # 数据表中不存在的列
1177
- for col in col_not_exist:
1178
- # 创建列,需转义
1179
- alter_sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]}"
1180
- if not allow_not_null:
1181
- alter_sql += " NOT NULL"
1182
- cursor.execute(alter_sql)
1183
- logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
1480
+ try:
1481
+ with self._get_connection() as conn:
1482
+ with conn.cursor() as cursor:
1483
+ cursor.execute(sql, (db_name,))
1484
+ exists = bool(cursor.fetchone())
1485
+ self._log_with_metrics('debug', f"{db_name} 数据库已存在: {exists}")
1486
+ return exists
1487
+ except Exception as e:
1488
+ self._log_with_metrics('error', f"检查数据库是否存在时出错: {str(e)}")
1489
+ raise
1184
1490
 
1185
- # 创建索引
1186
- if col == '日期':
1187
- sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
1188
- cursor.execute(sql, (col,))
1189
- result = cursor.fetchone() # 检查索引是否存在
1190
- if not result:
1191
- cursor.execute(f"CREATE INDEX index_name ON `{table_name}`(`{col}`)")
1192
- connection.commit() # 提交事务
1491
+ def _create_database(self, db_name: str):
1492
+ """创建数据库"""
1493
+ db_name = self._validate_identifier(db_name)
1494
+ sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
1193
1495
 
1194
- if df_sql:
1195
- logger.info(f'正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
1196
- engine = create_engine(
1197
- f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
1198
- df.to_sql(
1199
- name=table_name,
1200
- con=engine,
1201
- if_exists='append',
1202
- index=False,
1203
- chunksize=1000,
1204
- method='multi'
1205
- )
1206
- connection.commit() # 提交事务
1207
- connection.close()
1208
- return
1496
+ try:
1497
+ with self._get_connection() as conn:
1498
+ with conn.cursor() as cursor:
1499
+ cursor.execute(sql)
1500
+ conn.commit()
1501
+ self._log_with_metrics('info', f"{db_name} 数据库已创建")
1502
+ except Exception as e:
1503
+ self._log_with_metrics('error', f"{db_name}: 无法创建数据库 {str(e)}")
1504
+ conn.rollback()
1505
+ raise
1209
1506
 
1210
- # 5. 移除指定日期范围内的数据,原则上只限于聚合数据使用,原始数据插入时不要设置
1211
- if move_insert and '日期' in df.columns.tolist():
1212
- # 移除数据
1213
- dates = df['日期'].values.tolist()
1214
- dates = [pd.to_datetime(item) for item in dates] # 需要先转换类型才能用 min, max
1215
- start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
1216
- end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
1507
+ def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
1508
+ """
1509
+ 获取分表名称
1217
1510
 
1218
- delete_sql = f"""
1219
- DELETE FROM `{table_name}`
1220
- WHERE 日期 BETWEEN %s AND %s
1221
- """
1222
- cursor.execute(delete_sql, (start_date, end_date))
1223
- connection.commit()
1511
+ :param table_name: 基础表名
1512
+ :param date_value: 日期值
1513
+ :param partition_by: 分表方式 ('year' 或 'month')
1514
+ :return: 分表名称
1515
+ :raises ValueError: 如果日期格式无效或分表方式无效
1516
+ """
1517
+ try:
1518
+ date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
1519
+ except ValueError:
1520
+ try:
1521
+ date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
1522
+ except ValueError:
1523
+ error_msg = f"无效的日期格式: {date_value}"
1524
+ self._log_with_metrics('error', error_msg)
1525
+ raise ValueError(error_msg)
1224
1526
 
1225
- # 插入数据
1226
- engine = create_engine(
1227
- f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
1228
- df.to_sql(
1229
- name=table_name,
1230
- con=engine,
1231
- if_exists='append',
1232
- index=False,
1233
- chunksize=1000,
1234
- method='multi'
1235
- )
1236
- return
1527
+ if partition_by == 'year':
1528
+ return f"{table_name}_{date_obj.year}"
1529
+ elif partition_by == 'month':
1530
+ return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
1531
+ else:
1532
+ error_msg = "partition_by must be 'year' or 'month'"
1533
+ self._log_with_metrics('error', error_msg)
1534
+ raise ValueError(error_msg)
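Given the two accepted date formats, the naming works out as follows (illustrative calls on an existing `uploader` instance; the table name is a placeholder):

```python
uploader._get_partition_table_name('sales', '2024-08-27', 'year')
# -> 'sales_2024'
uploader._get_partition_table_name('sales', '2024-08-27 10:30:00', 'month')
# -> 'sales_2024_08'
uploader._get_partition_table_name('sales', '2024-08-27', 'week')
# raises ValueError ("partition_by must be 'year' or 'month'")
```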
1237
1535
 
1238
- datas = df.to_dict(orient='records')
1239
- for data in datas:
1240
- # data 是传进来待处理的数据, 不是数据库数据
1241
- # data 示例: {'日期': Timestamp('2024-08-27 00:00:00'), '推广费余额': 33299, '品销宝余额': 2930.73, '短信剩余': 67471}
1242
- try:
1243
- # 预处理数据:转换非字符串类型
1244
- processed_data = {}
1245
- for k, v in data.items():
1246
- if isinstance(v, (int, float)):
1247
- processed_data[k] = float(v)
1248
- elif isinstance(v, pd.Timestamp):
1249
- processed_data[k] = v.strftime('%Y-%m-%d')
1250
- else:
1251
- processed_data[k] = str(v)
1536
+ def _validate_identifier(self, identifier: str) -> str:
1537
+ """
1538
+ 验证并清理数据库标识符(数据库名、表名、列名)
1539
+ 防止SQL注入和非法字符
1252
1540
 
1253
- # 构建基础SQL要素
1254
- columns = [f'`{k}`' for k in processed_data.keys()]
1255
- placeholders = ', '.join(['%s'] * len(processed_data))
1256
- values = list(processed_data.values())
1541
+ :param identifier: 要验证的标识符
1542
+ :return: 清理后的安全标识符
1543
+ :raises ValueError: 如果标识符无效
1544
+ """
1545
+ if not identifier or not isinstance(identifier, str):
1546
+ error_msg = f"无效的标识符: {identifier}"
1547
+ self._log_with_metrics('error', error_msg)
1548
+ raise ValueError(error_msg)
1257
1549
 
1258
- # 构建基本INSERT语句
1259
- insert_sql = f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({placeholders})"
1550
+ # 移除非法字符,只保留字母、数字、下划线和美元符号
1551
+ cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
1552
+ if not cleaned:
1553
+ error_msg = f"无法清理异常标识符: {identifier}"
1554
+ self._log_with_metrics('error', error_msg)
1555
+ raise ValueError(error_msg)
1556
+
1557
+ # 检查是否为MySQL保留字
1558
+ mysql_keywords = {
1559
+ 'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
1560
+ 'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
1561
+ }
1562
+ if cleaned.lower() in mysql_keywords:
1563
+ self._log_with_metrics('debug', f"存在MySQL保留字: {cleaned}")
1564
+ return f"`{cleaned}`"
1260
1565
 
1261
- if icm_update: # 增量更新, 专门用于聚合数据,其他库不要调用
1262
- # 获取数据表结构
1263
- cursor.execute(
1264
- "SELECT COLUMN_NAME FROM information_schema.columns "
1265
- "WHERE table_schema = %s AND table_name = %s",
1266
- (db_name, table_name)
1267
- )
1268
- cols_exist = [row['COLUMN_NAME'] for row in cursor.fetchall()]
1269
- update_columns = [col for col in cols_exist if col not in icm_update and col != 'id']
1566
+ return cleaned
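Illustrative behaviour of the cleaning rules above, assuming `uploader` is an instance (the `\w` class plus the CJK range keeps Chinese column names intact):

```python
uploader._validate_identifier('订单 明细-2024')  # -> '订单明细2024'   (space and '-' stripped)
uploader._validate_identifier('select')          # -> '`select`'       (reserved word, backtick-quoted)
uploader._validate_identifier('!!!')             # raises ValueError   (nothing left after cleaning)
```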
1567
+
1568
+ def _check_table_exists(self, db_name: str, table_name: str) -> bool:
1569
+ """检查表是否存在"""
1570
+ db_name = self._validate_identifier(db_name)
1571
+ table_name = self._validate_identifier(table_name)
1572
+ sql = """
1573
+ SELECT TABLE_NAME
1574
+ FROM INFORMATION_SCHEMA.TABLES
1575
+ WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
1576
+ """
1577
+
1578
+ try:
1579
+ with self._get_connection() as conn:
1580
+ with conn.cursor() as cursor:
1581
+ cursor.execute(sql, (db_name, table_name))
1582
+ exists = bool(cursor.fetchone())
1583
+ return exists
1584
+ except Exception as e:
1585
+ self._log_with_metrics('error', f"检查数据表是否存在时发生未知错误: {e}", )
1586
+ raise
1587
+
1588
+ def _create_table(
1589
+ self,
1590
+ db_name: str,
1591
+ table_name: str,
1592
+ set_typ: Dict[str, str],
1593
+ primary_keys: Optional[List[str]] = None,
1594
+ date_column: Optional[str] = None,
1595
+ indexes: Optional[List[str]] = None,
1596
+ allow_null: bool = False
1597
+ ):
1598
+ """
1599
+ 创建数据表
1270
1600
 
1271
- # 构建WHERE条件
1272
- where_conditions = []
1273
- where_values = []
1274
- for col in icm_update:
1275
- where_conditions.append(f"`{col}` = %s")
1276
- where_values.append(processed_data[col])
1601
+ :param db_name: 数据库名
1602
+ :param table_name: 表名
1603
+ :param set_typ: 列名和数据类型字典 {列名: 数据类型}
1604
+ :param primary_keys: 主键列列表
1605
+ :param date_column: 日期列名,如果存在将设置为索引
1606
+ :param indexes: 需要创建索引的列列表
1607
+ """
1608
+ db_name = self._validate_identifier(db_name)
1609
+ table_name = self._validate_identifier(table_name)
1277
1610
 
1278
- # 查询现有数据
1279
- select_sql = f"SELECT {', '.join([f'`{col}`' for col in update_columns])} " \
1280
- f"FROM `{table_name}` WHERE {' AND '.join(where_conditions)}"
1281
- cursor.execute(select_sql, where_values)
1282
- existing_data = cursor.fetchone()
1611
+ if not set_typ:
1612
+ error_msg = "No columns specified for table creation"
1613
+ self._log_with_metrics('error', error_msg)
1614
+ raise ValueError(error_msg)
1283
1615
 
1284
- if existing_data:
1285
- # 比较并构建更新语句
1286
- update_set = []
1287
- update_values = []
1288
- for col in update_columns:
1289
- db_value = existing_data[col]
1290
- new_value = processed_data[col]
1616
+ # 构建列定义SQL
1617
+ column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
1291
1618
 
1292
- # 处理数值类型的精度差异
1293
- if isinstance(db_value, float) and isinstance(new_value, float):
1294
- if not math.isclose(db_value, new_value, rel_tol=1e-9):
1295
- update_set.append(f"`{col}` = %s")
1296
- update_values.append(new_value)
1297
- elif db_value != new_value:
1298
- update_set.append(f"`{col}` = %s")
1299
- update_values.append(new_value)
1619
+ # 添加其他列定义
1620
+ for col_name, col_type in set_typ.items():
1621
+ # 跳过id列,因为已经在前面添加了
1622
+ if col_name.lower() == 'id':
1623
+ continue
1624
+ safe_col_name = self._validate_identifier(col_name)
1625
+ col_def = f"`{safe_col_name}` {col_type}"
1300
1626
 
1301
- if update_set:
1302
- update_sql = f"UPDATE `{table_name}` SET {', '.join(update_set)} " \
1303
- f"WHERE {' AND '.join(where_conditions)}"
1304
- cursor.execute(update_sql, update_values + where_values)
1305
- else:
1306
- cursor.execute(insert_sql, values)
1307
- else:
1308
- # 普通插入
1309
- cursor.execute(insert_sql, values)
1310
- except Exception as e:
1311
- pass
1312
- connection.commit() # 提交事务
1313
- connection.close()
1627
+ # 根据allow_null决定是否添加NOT NULL约束
1628
+ if not allow_null and not col_type.lower().startswith('json'):
1629
+ col_def += " NOT NULL"
1314
1630
 
1631
+ column_defs.append(col_def)
1315
1632
 
1316
- class OptimizeDatas:
1317
- """
1318
- 数据维护 删除 mysql 的冗余数据
1319
- 更新过程:
1320
- 1. 读取所有数据表
1321
- 2. 遍历表, 遍历列, 如果存在日期列则按天遍历所有日期, 不存在则全表读取
1322
- 3. 按天删除所有冗余数据(存在日期列时)
1323
- tips: 查找冗余数据的方式是创建一个临时迭代器, 逐行读取数据并添加到迭代器, 出现重复时将重复数据的 id 添加到临时列表, 按列表 id 执行删除
1324
- """
1325
- def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
1326
- self.username = username
1327
- self.password = password
1328
- self.host = host
1329
- self.port = port # 默认端口, 此后可能更新,不作为必传参数
1330
- self.charset = charset
1331
- self.config = {
1332
- 'host': self.host,
1333
- 'port': int(self.port),
1334
- 'user': self.username,
1335
- 'password': self.password,
1336
- 'charset': self.charset, # utf8mb4 支持存储四字节的UTF-8字符集
1337
- 'cursorclass': pymysql.cursors.DictCursor,
1338
- }
1339
- self.db_name_lists: list = [] # 更新多个数据库 删除重复数据
1340
- self.db_name = None
1341
- self.days: int = 63 # 对近 N 天的数据进行排重
1342
- self.end_date = None
1343
- self.start_date = None
1344
- self.connection = None
1633
+ # 添加主键定义
1634
+ if primary_keys:
1635
+ # 确保id在主键中
1636
+ if 'id' not in [pk.lower() for pk in primary_keys]:
1637
+ primary_keys = ['id'] + primary_keys
1638
+ else:
1639
+ # 如果没有指定主键,则使用id作为主键
1640
+ primary_keys = ['id']
1345
1641
 
1346
- @staticmethod
1347
- def try_except(func): # 在类内部定义一个异常处理方法
1642
+ # 添加主键定义
1643
+ safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
1644
+ primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
1348
1645
 
1349
- @wraps(func)
1350
- def wrapper(*args, **kwargs):
1351
- try:
1352
- return func(*args, **kwargs)
1353
- except Exception as e:
1354
- logger.error(f'{func.__name__}, {e}') # 将异常信息返回
1646
+ # 构建完整SQL
1647
+ sql = f"""
1648
+ CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
1649
+ {','.join(column_defs)}
1650
+ {primary_key_sql}
1651
+ ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
1652
+ """
1355
1653
 
1356
- return wrapper
1654
+ try:
1655
+ with self._get_connection() as conn:
1656
+ with conn.cursor() as cursor:
1657
+ cursor.execute(sql)
1658
+ self._log_with_metrics('info', f"{db_name}.{table_name}: 数据表已创建")
1659
+
1660
+ # 添加普通索引
1661
+ index_statements = []
1662
+
1663
+ # 日期列索引
1664
+ if date_column and date_column in set_typ:
1665
+ safe_date_col = self._validate_identifier(date_column)
1666
+ index_statements.append(
1667
+ f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
1668
+ )
1669
+
1670
+ # 其他索引
1671
+ if indexes:
1672
+ for idx_col in indexes:
1673
+ if idx_col in set_typ:
1674
+ safe_idx_col = self._validate_identifier(idx_col)
1675
+ index_statements.append(
1676
+ f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
1677
+ )
1678
+
1679
+ # 执行所有索引创建语句
1680
+ if index_statements:
1681
+ with conn.cursor() as cursor:
1682
+ for stmt in index_statements:
1683
+ cursor.execute(stmt)
1684
+ self._log_with_metrics('debug', f"Executed index statement: {stmt}", )
1685
+
1686
+ conn.commit()
1687
+ self._log_with_metrics('info', f"{db_name}.{table_name}: 索引已添加")
1357
1688
 
1358
- def keep_connect(self, _db_name, _config, max_try: int=10):
1359
- attempts = 1
1360
- while attempts <= max_try:
1689
+ except Exception as e:
1690
+ self._log_with_metrics('error', f"{db_name}.{table_name}: 建表失败: {str(e)}")
1691
+ conn.rollback()
1692
+ raise
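For a small `set_typ`, the statement assembled above comes out roughly like the following (illustrative database/table/column names, default charset and collation, no extra primary keys); if `date_column` or `indexes` are passed, matching `ALTER TABLE ... ADD INDEX` statements follow in the same transaction:

```python
# Illustrative DDL for set_typ = {'日期': 'DATE', '店铺名称': 'varchar(100)'} with allow_null=False.
expected_sql = """
CREATE TABLE IF NOT EXISTS `测试库`.`销售表` (
    `id` INT NOT NULL AUTO_INCREMENT,
    `日期` DATE NOT NULL,
    `店铺名称` varchar(100) NOT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
"""
```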
1693
+
1694
+ def _validate_datetime(self, value):
1695
+ formats = [
1696
+ '%Y-%m-%d %H:%M:%S',
1697
+ '%Y-%m-%d',
1698
+ '%Y/%m/%d %H:%M:%S',
1699
+ '%Y/%m/%d',
1700
+ '%Y%m%d',
1701
+ '%Y-%m-%dT%H:%M:%S',
1702
+ '%Y-%m-%d %H:%M:%S.%f'
1703
+ ]
1704
+ for fmt in formats:
1361
1705
  try:
1362
- connection = pymysql.connect(**_config) # 连接数据库
1363
- return connection
1364
- except Exception as e:
1365
- logger.error(f'{_db_name}连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
1366
- attempts += 1
1367
- time.sleep(30)
1368
- logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
1369
- return None
1706
+ return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
1707
+ except ValueError:
1708
+ continue
1709
+ raise ValueError(f"无效的日期格式: {value}")
1370
1710
 
1371
- def optimize_list(self):
1711
+ def _validate_value(self, value: Any, column_type: str) -> Any:
1372
1712
  """
1373
- 更新多个数据库 移除冗余数据
1374
- 需要设置 self.db_name_lists
1713
+ 验证并清理数据值,根据列类型进行适当转换
1714
+
1715
+ :param value: 要验证的值
1716
+ :param column_type: 列的数据类型
1717
+ :return: 清理后的值
1718
+ :raises ValueError: 如果值转换失败
1375
1719
  """
1376
- if not self.db_name_lists:
1377
- logger.info(f'尚未设置参数: self.db_name_lists')
1378
- return
1379
- for db_name in self.db_name_lists:
1380
- self.db_name = db_name
1381
- self.optimize()
1720
+ if value is None:
1721
+ return None
1382
1722
 
1383
- def optimize(self, except_key=['更新时间']):
1384
- """ 更新一个数据库 移除冗余数据 """
1385
- if not self.db_name:
1386
- logger.info(f'尚未设置参数: self.db_name')
1387
- return
1388
- tables = self.table_list(db_name=self.db_name)
1389
- if not tables:
1390
- logger.info(f'{self.db_name} -> 数据表不存在')
1391
- return
1723
+ try:
1724
+ column_type_lower = column_type.lower()
1392
1725
 
1393
- # 日期初始化
1394
- if not self.end_date:
1395
- self.end_date = pd.to_datetime(datetime.datetime.today())
1396
- else:
1397
- self.end_date = pd.to_datetime(self.end_date)
1398
- if self.days:
1399
- self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
1400
- if not self.start_date:
1401
- self.start_date = self.end_date
1402
- else:
1403
- self.start_date = pd.to_datetime(self.start_date)
1404
- start_date_before = self.start_date
1405
- end_date_before = self.end_date
1726
+ if 'int' in column_type_lower:
1727
+ return int(value) if value is not None else None
1728
+ elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
1729
+ return float(value) if value is not None else None
1730
+ elif '日期' in column_type_lower or 'time' in column_type_lower:
1731
+ if isinstance(value, (datetime.datetime, pd.Timestamp)):
1732
+ return value.strftime('%Y-%m-%d %H:%M:%S')
1733
+ elif isinstance(value, str):
1734
+ try:
1735
+ return self._validate_datetime(value) # 使用专门的日期验证方法
1736
+ except ValueError as e:
1737
+ raise ValueError(f"无效日期格式: {value} - {str(e)}")
1738
+ return str(value)
1739
+ elif 'char' in column_type_lower or 'text' in column_type_lower:
1740
+ # 防止SQL注入
1741
+ if isinstance(value, str):
1742
+ return value.replace('\\', '\\\\').replace("'", "\\'")
1743
+ return str(value)
1744
+ elif 'json' in column_type_lower:
1745
+ import json
1746
+ return json.dumps(value) if value is not None else None
1747
+ else:
1748
+ return value
1749
+ except (ValueError, TypeError) as e:
1750
+ error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
1751
+ self._log_with_metrics('error', error_msg)
1752
+ raise ValueError(error_msg)
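Illustrative conversions by the branches above, again on an existing `uploader` instance:

```python
uploader._validate_value('128', 'INT')               # -> 128
uploader._validate_value('3.1400', 'decimal(10,4)')  # -> 3.14
uploader._validate_value("O'Brien", 'varchar(50)')   # -> "O\\'Brien"  (quote escaped)
uploader._validate_value({'key': 1}, 'json')         # -> '{"key": 1}'
uploader._validate_value('abc', 'INT')               # raises ValueError
```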
1406
1753
 
1407
- logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
1408
- for table_dict in tables:
1409
- for key, table_name in table_dict.items():
1410
- self.config.update({'database': self.db_name}) # 添加更新 config 字段
1411
- self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1412
- if not self.connection:
1413
- return
1414
- with self.connection.cursor() as cursor:
1415
- sql = f"SELECT 1 FROM `{table_name}` LIMIT 1"
1416
- cursor.execute(sql)
1417
- result = cursor.fetchone()
1418
- if not result:
1419
- logger.info(f'数据表: {table_name}, 数据长度为 0')
1420
- continue # 检查数据表是否为空
1754
+ def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
1755
+ """获取表的列名和数据类型"""
1756
+ db_name = self._validate_identifier(db_name)
1757
+ table_name = self._validate_identifier(table_name)
1758
+ sql = """
1759
+ SELECT COLUMN_NAME, DATA_TYPE
1760
+ FROM INFORMATION_SCHEMA.COLUMNS
1761
+ WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
1762
+ ORDER BY ORDINAL_POSITION
1763
+ """
1421
1764
 
1422
- cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
1423
- columns = cursor.fetchall()
1424
- date_exist = False
1425
- for col in columns: # 遍历列信息,检查是否存在类型为日期的列
1426
- if col['Field'] == '日期' and (col['Type'] == 'date' or col['Type'].startswith('datetime')):
1427
- date_exist = True
1428
- break
1429
- if date_exist: # 存在日期列
1430
- sql_max = f"SELECT MAX(日期) AS max_date FROM `{table_name}`"
1431
- sql_min = f"SELECT MIN(日期) AS min_date FROM `{table_name}`"
1432
- cursor.execute(sql_max)
1433
- max_result = cursor.fetchone()
1434
- cursor.execute(sql_min)
1435
- min_result = cursor.fetchone()
1436
- # 匹配修改为合适的起始和结束日期
1437
- if self.start_date < pd.to_datetime(min_result['min_date']):
1438
- self.start_date = pd.to_datetime(min_result['min_date'])
1439
- if self.end_date > pd.to_datetime(max_result['max_date']):
1440
- self.end_date = pd.to_datetime(max_result['max_date'])
1441
- dates_list = self.day_list(start_date=self.start_date, end_date=self.end_date)
1442
- # dates_list 是日期列表
1443
- for date in dates_list:
1444
- self.delete_duplicate(table_name=table_name, date=date, except_key=except_key)
1445
- self.start_date = start_date_before # 重置,不然日期错乱
1446
- self.end_date = end_date_before
1447
- else: # 不存在日期列的情况
1448
- self.delete_duplicate2(table_name=table_name, except_key=except_key)
1449
- self.connection.close()
1450
- logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
1765
+ try:
1766
+ with self._get_connection() as conn:
1767
+ with conn.cursor() as cursor:
1768
+ cursor.execute(sql, (db_name, table_name))
1769
+ set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
1770
+ self._log_with_metrics('debug', f"{db_name}.{table_name}: 获取表的列信息: {set_typ}")
1771
+ return set_typ
1772
+ except Exception as e:
1773
+ self._log_with_metrics('error', f"无法获取表列信息: {str(e)}")
1774
+ raise
1775
+
1776
+ def _upload_to_table(
1777
+ self,
1778
+ db_name: str,
1779
+ table_name: str,
1780
+ data: List[Dict],
1781
+ set_typ: Dict[str, str],
1782
+ primary_keys: Optional[List[str]],
1783
+ check_duplicate: bool,
1784
+ duplicate_columns: Optional[List[str]],
1785
+ allow_null: bool,
1786
+ auto_create: bool,
1787
+ date_column: Optional[str],
1788
+ indexes: Optional[List[str]],
1789
+ batch_id: Optional[str] = None
1790
+ ):
1791
+ """实际执行表上传的方法"""
1792
+ # 检查表是否存在
1793
+ if not self._check_table_exists(db_name, table_name):
1794
+ if auto_create:
1795
+ self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
1796
+ allow_null=allow_null)
1797
+ else:
1798
+ error_msg = f"数据表不存在: '{db_name}.{table_name}'"
1799
+ self._log_with_metrics('error', error_msg)
1800
+ raise ValueError(error_msg)
1801
+
1802
+ # 获取表结构并验证
1803
+ table_columns = self._get_table_columns(db_name, table_name)
1804
+ if not table_columns:
1805
+ error_msg = f"获取列失败 '{db_name}.{table_name}'"
1806
+ self._log_with_metrics('error', error_msg)
1807
+ raise ValueError(error_msg)
1808
+
1809
+ # 验证数据列与表列匹配
1810
+ for col in set_typ:
1811
+ if col not in table_columns:
1812
+ error_msg = f"列不存在: '{col}' -> '{db_name}.{table_name}'"
1813
+ self._log_with_metrics('error', error_msg)
1814
+ raise ValueError(error_msg)
1815
+
1816
+ # 插入数据
1817
+ self._insert_data(
1818
+ db_name, table_name, data, set_typ,
1819
+ check_duplicate, duplicate_columns
1820
+ )
1451
1821
 
1452
- def delete_duplicate(self, table_name, date, except_key=['更新时间']):
1453
- datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
1454
- if not datas:
1455
- return
1456
- duplicate_id = [] # 出现重复的 id
1457
- all_datas = [] # 迭代器
1458
- for data in datas:
1459
- for e_key in except_key:
1460
- if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
1461
- del data[e_key]
1822
+ def _prepare_data(
1823
+ self,
1824
+ data: Union[Dict, List[Dict], pd.DataFrame],
1825
+ set_typ: Dict[str, str],
1826
+ allow_null: bool = False
1827
+ ) -> List[Dict]:
1828
+ """
1829
+ 准备要上传的数据,验证并转换数据类型
1830
+
1831
+ :param data: 输入数据
1832
+ :param set_typ: 列名和数据类型字典 {列名: 数据类型}
1833
+ :param allow_null: 是否允许空值
1834
+ :return: 准备好的数据列表
1835
+ :raises ValueError: 如果数据验证失败
1836
+ """
1837
+ # 统一数据格式为字典列表
1838
+ if isinstance(data, pd.DataFrame):
1462
1839
  try:
1463
- delete_id = data['id']
1464
- del data['id']
1465
- data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
1466
- if data in all_datas: # 数据出现重复时
1467
- if delete_id:
1468
- duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
1469
- continue
1470
- all_datas.append(data) # 数据没有重复
1840
+ data = data.replace({pd.NA: None}).to_dict('records')
1471
1841
  except Exception as e:
1472
- logger.debug(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> {e}')
1473
- del all_datas
1842
+ self._log_with_metrics("error", f"转为为字典时发生错误: {e}", )
1843
+ raise ValueError(f"转为为字典时发生错误: {e}")
1844
+ elif isinstance(data, dict):
1845
+ data = [data]
1846
+ elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
1847
+ error_msg = "Data must be a dict, list of dicts, or DataFrame"
1848
+ self._log_with_metrics('error', error_msg)
1849
+ raise ValueError(error_msg)
1474
1850
 
1475
- if not duplicate_id: # 如果没有重复数据,则跳过该数据表
1476
- return
1851
+ prepared_data = []
1852
+ for row_idx, row in enumerate(data, 1):
1853
+ prepared_row = {}
1854
+ for col_name, col_type in set_typ.items():
1855
+ # 跳过id列,不允许外部传入id
1856
+ if col_name.lower() == 'id':
1857
+ continue
1477
1858
 
1478
- try:
1479
- with self.connection.cursor() as cursor:
1480
- placeholders = ', '.join(['%s'] * len(duplicate_id))
1481
- # 移除冗余数据
1482
- sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
1483
- cursor.execute(sql, duplicate_id)
1484
- logger.debug(f"{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
1485
- self.connection.commit() # 提交事务
1486
- except Exception as e:
1487
- logger.error(f'{self.db_name}/{table_name}, {e}')
1488
- self.connection.rollback() # 异常则回滚
1859
+ if col_name not in row:
1860
+ if not allow_null:
1861
+ error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
1862
+ self._log_with_metrics('error', error_msg)
1863
+ raise ValueError(error_msg)
1864
+ prepared_row[col_name] = None
1865
+ else:
1866
+ try:
1867
+ prepared_row[col_name] = self._validate_value(row[col_name], col_type)
1868
+ except ValueError as e:
1869
+ error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
1870
+ self._log_with_metrics('error', error_msg)
1871
+ raise ValueError(error_msg)
1872
+ prepared_data.append(prepared_row)
1489
1873
 
1490
- def delete_duplicate2(self, table_name, except_key=['更新时间']):
1491
- with self.connection.cursor() as cursor:
1492
- sql = f"SELECT * FROM `{table_name}`" # 如果不包含日期列,则获取全部数据
1493
- cursor.execute(sql)
1494
- datas = cursor.fetchall()
1495
- if not datas:
1496
- return
1497
- duplicate_id = [] # 出现重复的 id
1498
- all_datas = [] # 迭代器
1499
- for data in datas:
1500
- for e_key in except_key:
1501
- if e_key in data.keys(): # 在检查重复数据时,不包含 更新时间 字段
1502
- del data[e_key]
1503
- delete_id = data['id']
1504
- del data['id']
1505
- data = re.sub(r'\.0+\', ', '\', ', str(data)) # 统一移除小数点后面的 0
1506
- if data in all_datas: # 数据出现重复时
1507
- duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
1508
- continue
1509
- all_datas.append(data) # 数据没有重复
1510
- del all_datas
1874
+ self._log_with_metrics('debug', f"已准备 {len(prepared_data)} 行数据")
1875
+ return prepared_data
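A hedged end-to-end sketch of the `set_typ` contract with a one-row DataFrame; `uploader` is an existing instance and the column names are placeholders. A DATETIME type is used for the date column so that the string is normalised by `_validate_value`:

```python
import pandas as pd

set_typ = {'日期': 'DATETIME', '店铺名称': 'varchar(100)', '花费': 'decimal(12,2)'}
df = pd.DataFrame([{'日期': '2024/08/27', '店铺名称': '旗舰店', '花费': '199.50'}])

rows = uploader._prepare_data(df, set_typ, allow_null=False)
# rows == [{'日期': '2024-08-27 00:00:00', '店铺名称': '旗舰店', '花费': 199.5}]
```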
1511
1876
 
1512
- if not duplicate_id: # 如果没有重复数据,则跳过该数据表
1513
- return
1877
+ def upload_data(
1878
+ self,
1879
+ db_name: str,
1880
+ table_name: str,
1881
+ data: Union[Dict, List[Dict], pd.DataFrame],
1882
+ set_typ: Dict[str, str],
1883
+ primary_keys: Optional[List[str]] = None,
1884
+ check_duplicate: bool = False,
1885
+ duplicate_columns: Optional[List[str]] = None,
1886
+ allow_null: bool = False,
1887
+ partition_by: Optional[str] = None,
1888
+ partition_date_column: str = '日期',
1889
+ auto_create: bool = True,
1890
+ indexes: Optional[List[str]] = None
1891
+ ):
1892
+ """
1893
+ 上传数据到数据库
1894
+ """
1895
+ upload_start = time.time()
1896
+ self._record_metrics('total_uploads', 1)
1897
+ initial_row_count = len(data) if hasattr(data, '__len__') else 1
1898
+ self.metrics['total_rows'] += len(data) if hasattr(data, '__len__') else 1
1899
+
1900
+ batch_id = f"batch_{int(time.time() * 1000)}"
1901
+ success_flag = False
1902
+
1903
+ self._log_with_metrics('info', "开始上传数据", {
1904
+ 'batch_id': batch_id,
1905
+ 'database': db_name,
1906
+ 'table': table_name,
1907
+ 'partition_by': partition_by,
1908
+ 'check_duplicate': check_duplicate,
1909
+ 'row_count': len(data) if hasattr(data, '__len__') else 1,
1910
+ 'auto_create': auto_create
1911
+ })
1514
1912
 
1515
1913
  try:
1516
- with self.connection.cursor() as cursor:
1517
- placeholders = ', '.join(['%s'] * len(duplicate_id))
1518
- # 移除冗余数据
1519
- sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
1520
- cursor.execute(sql, duplicate_id)
1521
- logger.info(f"{table_name} -> before: {len(datas)}, "
1522
- f"remove: {cursor.rowcount}")
1523
- self.connection.commit() # 提交事务
1524
- except Exception as e:
1525
- logger.error(f'{self.db_name}/{table_name}, {e}')
1526
- self.connection.rollback() # 异常则回滚
1914
+ # 验证参数
1915
+ if not set_typ:
1916
+ error_msg = "必须指定列定义"
1917
+ self._log_with_metrics('error', error_msg)
1918
+ raise ValueError(error_msg)
1919
+
1920
+ if partition_by and partition_by not in ['year', 'month']:
1921
+ error_msg = "分表方式必须是 'year' 或 'month'"
1922
+ self._log_with_metrics('error', error_msg)
1923
+ raise ValueError(error_msg)
1924
+
1925
+ # 准备数据
1926
+ prepared_data = self._prepare_data(data, set_typ, allow_null)
1927
+
1928
+ # 检查数据库是否存在
1929
+ if not self._check_database_exists(db_name):
1930
+ if auto_create:
1931
+ self._create_database(db_name)
1932
+ else:
1933
+ error_msg = f"数据库不存在: '{db_name}'"
1934
+ self._log_with_metrics('error', error_msg)
1935
+ raise ValueError(error_msg)
1936
+
1937
+ # 处理分表逻辑
1938
+ if partition_by:
1939
+ partitioned_data = {}
1940
+ for row in prepared_data:
1941
+ try:
1942
+ if partition_date_column not in row:
1943
+ error_msg = f"异常缺失列 '{partition_date_column}'"
1944
+ self._log_with_metrics('error', error_msg)
1945
+ continue # 跳过当前行
1946
+
1947
+ part_table = self._get_partition_table_name(
1948
+ table_name,
1949
+ str(row[partition_date_column]),
1950
+ partition_by
1951
+ )
1952
+ if part_table not in partitioned_data:
1953
+ partitioned_data[part_table] = []
1954
+ partitioned_data[part_table].append(row)
1955
+ except Exception as e:
1956
+ self._log_with_metrics('error', "分表处理失败", {
1957
+ 'row_data': row,
1958
+ 'error': str(e)
1959
+ })
1960
+ continue # 跳过当前行
1961
+
1962
+ # 对每个分表执行上传
1963
+ for part_table, part_data in partitioned_data.items():
1964
+ try:
1965
+ self._upload_to_table(
1966
+ db_name, part_table, part_data, set_typ,
1967
+ primary_keys, check_duplicate, duplicate_columns,
1968
+ allow_null, auto_create, partition_date_column,
1969
+ indexes, batch_id
1970
+ )
1971
+ except Exception as e:
1972
+ self._log_with_metrics('error', "分表上传失败", {
1973
+ 'partition_table': part_table,
1974
+ 'error': str(e)
1975
+ })
1976
+ continue # 跳过当前分表,继续处理其他分表
1977
+ else:
1978
+ # 不分表,直接上传
1979
+ self._upload_to_table(
1980
+ db_name, table_name, prepared_data, set_typ,
1981
+ primary_keys, check_duplicate, duplicate_columns,
1982
+ allow_null, auto_create, partition_date_column,
1983
+ indexes, batch_id
1984
+ )
1527
1985
 
1528
- def database_list(self):
1529
- """ 获取所有数据库 """
1530
- connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1531
- if not connection:
1532
- return
1533
- with connection.cursor() as cursor:
1534
- cursor.execute("SHOW DATABASES")
1535
- databases = cursor.fetchall() # 获取所有数据库的结果
1536
- connection.close()
1537
- return databases
1986
+ success_flag = True
1538
1987
 
1539
- def table_list(self, db_name):
1540
- """ 获取指定数据库的所有数据表 """
1541
- connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1542
- if not connection:
1543
- return
1544
- try:
1545
- with connection.cursor() as cursor:
1546
- cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
1547
- database_exists = cursor.fetchone()
1548
- if not database_exists:
1549
- logger.info(f'{db_name}: 数据表不存在!')
1550
- return
1551
1988
  except Exception as e:
1552
- logger.error(f'002 {e}')
1553
- return
1989
+ self._log_with_metrics('error', "上传过程中发生全局错误", {
1990
+ 'error': str(e),
1991
+ 'error_type': type(e).__name__
1992
+ })
1554
1993
  finally:
1555
- connection.close() # 断开连接
1994
+ elapsed = time.time() - upload_start
1995
+ self._record_metrics('upload_execution_time', elapsed, is_timing=True)
1556
1996
 
1557
- self.config.update({'database': db_name}) # 添加更新 config 字段
1558
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1559
- if not connection:
1560
- return
1561
- with connection.cursor() as cursor:
1562
- cursor.execute("SHOW TABLES")
1563
- tables = cursor.fetchall() # 获取所有数据表
1564
- connection.close()
1565
- return tables
1997
+ if success_flag:
1998
+ self._record_metrics('successful_uploads', 1)
1999
+ else:
2000
+ self._record_metrics('failed_uploads', 1)
1566
2001
 
1567
- def table_datas(self, db_name, table_name, date):
2002
+ self._log_with_metrics('info', "上传处理完成", {
2003
+ 'batch_id': batch_id,
2004
+ 'success': success_flag,
2005
+ 'time_elapsed': elapsed,
2006
+ 'initial_row_count': initial_row_count,
2007
+ 'processed_rows': self.metrics['successful_rows'] + self.metrics['failed_rows']
2008
+ })
2009
+
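
The partition routing above delegates naming to _get_partition_table_name, which is outside this hunk; a plausible sketch of that mapping is below (the suffix format, <table>_<YYYY> / <table>_<YYYY>_<MM>, is an assumption, not confirmed by this diff):

    import datetime

    def get_partition_table_name(table_name: str, date_value: str, partition_by: str) -> str:
        # Assumed naming scheme; only 'year' and 'month' pass the validation in upload_data.
        date = datetime.datetime.strptime(str(date_value)[:10], '%Y-%m-%d')
        if partition_by == 'year':
            return f"{table_name}_{date.year}"
        if partition_by == 'month':
            return f"{table_name}_{date.year}_{date.month:02d}"
        raise ValueError("partition_by must be 'year' or 'month'")

    # e.g. get_partition_table_name('测试表', '2023-01-15', 'month') -> '测试表_2023_01'
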
2010
+ def _insert_data(
2011
+ self,
2012
+ db_name: str,
2013
+ table_name: str,
2014
+ data: List[Dict],
2015
+ set_typ: Dict[str, str],
2016
+ check_duplicate: bool = False,
2017
+ duplicate_columns: Optional[List[str]] = None,
2018
+ batch_size: int = 1000,
2019
+ batch_id: Optional[str] = None
2020
+ ):
1568
2021
  """
1569
- 获取指定数据表的数据, 按天获取
2022
+ 插入数据到表中,增强日志记录和性能监控
1570
2023
  """
1571
- self.config.update({'database': db_name}) # 添加更新 config 字段
1572
- connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
1573
- if not connection:
2024
+ if not data:
1574
2025
  return
2026
+
2027
+ # 获取所有列名(排除id列)
2028
+ all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
2029
+ safe_columns = [self._validate_identifier(col) for col in all_columns]
2030
+ placeholders = ','.join(['%s'] * len(safe_columns))
2031
+
2032
+ # 构建基础SQL语句
2033
+ if check_duplicate:
2034
+ if not duplicate_columns:
2035
+ duplicate_columns = all_columns
2036
+
2037
+ safe_dup_columns = [self._validate_identifier(col) for col in duplicate_columns]
2038
+ conditions = [f"`{col}` = %s" for col in safe_dup_columns]
2039
+ where_clause = " AND ".join(conditions)
2040
+
2041
+ sql = f"""
2042
+ INSERT INTO `{db_name}`.`{table_name}`
2043
+ (`{'`,`'.join(safe_columns)}`)
2044
+ SELECT {placeholders}
2045
+ FROM DUAL
2046
+ WHERE NOT EXISTS (
2047
+ SELECT 1 FROM `{db_name}`.`{table_name}`
2048
+ WHERE {where_clause}
2049
+ )
2050
+ """
2051
+ else:
2052
+ sql = f"""
2053
+ INSERT INTO `{db_name}`.`{table_name}`
2054
+ (`{'`,`'.join(safe_columns)}`)
2055
+ VALUES ({placeholders})
2056
+ """
2057
+
2058
+ total_inserted = 0
2059
+ total_skipped = 0
2060
+ total_failed = 0 # 失败计数器
2061
+
2062
+ # 分批插入数据
2063
+ with self._get_connection() as conn:
2064
+ with conn.cursor() as cursor:
2065
+ for i in range(0, len(data), batch_size):
2066
+ batch_start = time.time()
2067
+ batch = data[i:i + batch_size]
2068
+ successful_rows = batch_skipped = 0 # 当前批次成功执行的行数 / 因重复而跳过的行数
2069
+
2070
+ for row in batch:
2071
+ try:
2072
+ row_values = [row.get(col) for col in all_columns]
2073
+ if check_duplicate:
2074
+ row_values += [row.get(col) for col in duplicate_columns]
2075
+
2076
+ cursor.execute(sql, row_values)
2077
+ successful_rows += 1
+ if check_duplicate and cursor.rowcount == 0:
+ batch_skipped += 1 # 重复数据,未实际插入
2078
+ conn.commit() # 每次成功插入后提交
2079
+
2080
+ except Exception as e:
2081
+ conn.rollback() # 回滚当前行的事务
2082
+ total_failed += 1
2083
+
2084
+ # 记录失败行详细信息
2085
+ error_details = {
2086
+ 'batch_id': batch_id,
2087
+ 'database': db_name,
2088
+ 'table': table_name,
2089
+ 'row_data': row,
2090
+ 'error_type': type(e).__name__,
2091
+ 'error_message': str(e)
2092
+ }
2093
+ self._log_with_metrics('error', "单行插入失败", error_details)
2094
+ continue # 跳过当前行,继续处理下一行
2095
+
2096
+ # 更新统计信息
2097
+ if check_duplicate:
2098
+ cursor.execute("SELECT ROW_COUNT()")
2099
+ affected_rows = cursor.rowcount
2100
+ total_inserted += affected_rows
2101
+ total_skipped += len(batch) - affected_rows - (len(batch) - successful_rows)
2102
+ else:
2103
+ total_inserted += successful_rows
2104
+
2105
+ batch_elapsed = time.time() - batch_start
2106
+ self._record_metrics('batch_execution_time', batch_elapsed, is_timing=True)
2107
+
2108
+ batch_info = {
2109
+ 'batch_id': batch_id,
2110
+ 'batch_index': i // batch_size + 1,
2111
+ 'total_batches': (len(data) + batch_size - 1) // batch_size,
2112
+ 'batch_size': len(batch),
2113
+ 'successful_rows': successful_rows,
2114
+ 'failed_rows': len(batch) - successful_rows,
2115
+ 'time_elapsed': batch_elapsed,
2116
+ 'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
2117
+ }
2118
+ self._log_with_metrics('debug', "批次处理完成", batch_info)
2119
+
2120
+ # 更新全局指标
2121
+ self.metrics['failed_rows'] += total_failed
2122
+ self._log_with_metrics('info', "数据插入完成", {
2123
+ 'total_rows': len(data),
2124
+ 'inserted_rows': total_inserted,
2125
+ 'skipped_rows': total_skipped,
2126
+ 'failed_rows': total_failed
2127
+ })
2128
+
2129
+ def get_metrics(self) -> Dict:
2130
+ """获取当前性能指标"""
2131
+ metrics = self.metrics.copy()
2132
+
2133
+ # 添加当前系统指标
2134
+ metrics.update({
2135
+ 'current_time': datetime.datetime.now().isoformat(),
2136
+ 'system': self._get_system_metrics(),
2137
+ 'connection_pool': {
2138
+ 'size': self.pool_size,
2139
+ 'active': getattr(self.pool, '_connections', 0) if self.pool else 0 # DBUtils 内部计数,非公开 API
2140
+ }
2141
+ })
2142
+
2143
+ return metrics
2144
+
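
The counters returned by get_metrics() are the metrics dictionary maintained above merged with point-in-time system data; a short usage sketch, assuming uploader is an instance of this class:

    metrics = uploader.get_metrics()
    print(metrics['current_time'], metrics['connection_pool']['size'])
    print('failed rows so far:', metrics.get('failed_rows', 0))
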
2145
+ def close(self):
2146
+ """关闭连接池并记录最终指标"""
2147
+ close_start = time.time()
2148
+
1575
2149
  try:
1576
- with connection.cursor() as cursor:
1577
- sql = f"SELECT * FROM `{table_name}` WHERE {'日期'} BETWEEN '%s' AND '%s'" % (date, date)
1578
- cursor.execute(sql)
1579
- results = cursor.fetchall()
2150
+ if hasattr(self, 'pool') and self.pool is not None:
2151
+ # 记录关闭前的连接池状态
2152
+ active_connections = self._get_system_metrics().get('connections', 0)
2153
+
2154
+ # 更安全的关闭方式
2155
+ try:
2156
+ self.pool.close()
2157
+ except Exception as e:
2158
+ self._log_with_metrics('warning', "关闭连接池时出错", {
2159
+ 'error': str(e)
2160
+ })
2161
+
2162
+ self.pool = None
2163
+
2164
+ elapsed = time.time() - close_start
2165
+ self._log_with_metrics('info', "连接池已关闭", {
2166
+ 'active_connections_before_close': active_connections,
2167
+ 'close_time_elapsed': elapsed
2168
+ })
1580
2169
  except Exception as e:
1581
- logger.error(f'001 {e}')
2170
+ elapsed = time.time() - close_start
2171
+ self._log_with_metrics('error', "关闭连接池失败", {
2172
+ 'error': str(e),
2173
+ 'close_time_elapsed': elapsed
2174
+ })
2175
+ raise
1582
2176
  finally:
1583
- connection.close()
1584
- return results
1585
-
1586
- def day_list(self, start_date, end_date):
1587
- start_date = pd.to_datetime(start_date)
1588
- end_date = pd.to_datetime(end_date)
1589
- date_list = []
1590
- while start_date <= end_date:
1591
- date_list.append(pd.to_datetime(start_date.date()))
1592
- start_date += datetime.timedelta(days=1)
1593
- return date_list
2177
+ # 记录最终性能指标
2178
+ if hasattr(self, 'logger') and self.logger and self.enable_metrics:
2179
+ self._log_with_metrics('debug', "最终性能指标", self.get_metrics())
1594
2180
 
1595
- def rename_column(self):
1596
- """ 批量修改数据库的列名 """
1597
- """
1598
- # for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2',]:
1599
- # s = OptimizeDatas(username=username, password=password, host=host, port=port)
1600
- # s.db_name = db_name
1601
- # s.rename_column()
1602
- """
1603
- tables = self.table_list(db_name=self.db_name)
1604
- for table_dict in tables:
1605
- for key, table_name in table_dict.items():
1606
- self.config.update({'database': self.db_name}) # 添加更新 config 字段
1607
- self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
1608
- if not self.connection:
1609
- return
1610
- with self.connection.cursor() as cursor:
1611
- cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
1612
- columns = cursor.fetchall()
1613
- columns = [{column['Field']: column['Type']} for column in columns]
1614
- for column in columns:
1615
- for key, value in column.items():
1616
- if key.endswith('_'):
1617
- new_name = re.sub(r'_+$', '', key)
1618
- sql = f"ALTER TABLE `{table_name}` CHANGE COLUMN {key} {new_name} {value}"
1619
- cursor.execute(sql)
1620
- self.connection.commit()
1621
- if self.connection:
1622
- self.connection.close()
2181
+ def __main__():
2182
+ pass
1623
2183
 
1624
2184
 
1625
2185
  if __name__ == '__main__':
1626
2186
  pass
2187
+
2188
+ # 初始化上传器
2189
+ uploader = MySQLUploader(
2190
+ username='root',
2191
+ password='1',
2192
+ host='localhost',
2193
+ port=3306,
2194
+ logging_mode='both',
2195
+ log_level='info'
2196
+ )
2197
+
2198
+ # 定义列和数据类型
2199
+ set_typ = {
2200
+ 'id': 'INT',
2201
+ 'name': 'VARCHAR(255)',
2202
+ 'age': 'INT',
2203
+ 'salary': 'DECIMAL(10,2)',
2204
+ '日期': 'DATE'
2205
+ }
2206
+
2207
+ # 准备数据
2208
+ data = [
2209
+ {'日期': '2023-01-15', 'name': 'Alice', 'age': 35, 'salary': 100},
2210
+ {'日期': '2023-01-15', 'name': 'Alice', 'age': 30, 'salary': 0.0},
2211
+ {'日期': '2023-02-20', 'name': 'Bob', 'age': 25, 'salary': 45000.75}
2212
+ ]
2213
+
2214
+ # 上传数据
2215
+ uploader.upload_data(
2216
+ db_name='测试库',
2217
+ table_name='测试表',
2218
+ data=data,
2219
+ set_typ=set_typ, # 定义列和数据类型
2220
+ primary_keys=[], # 主键列表,空列表表示不另外指定主键
2221
+ check_duplicate=True, # 检查重复数据
2222
+ duplicate_columns=['name', 'age'], # 用于判断重复的列
2223
+ allow_null=False, # 不允许插入空值
2224
+ partition_by='year', # 按年分表('year' 或 'month')
2225
+ partition_date_column='日期', # 用于分表的日期列名,默认为'日期'
2226
+ auto_create=True, # 表不存在时自动创建, 默认参数不要更改
2227
+ indexes=['name'], # 指定索引列
2228
+ )
2229
+
2230
+ # 关闭上传器
2231
+ uploader.close()
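
Since close() releases the connection pool and logs the final metrics, wrapping the calls above in try/finally keeps the cleanup guaranteed even when upload_data raises; a sketch reusing the data and set_typ defined in this example and keeping only the connection arguments from the constructor call above:

    uploader = MySQLUploader(username='root', password='1', host='localhost', port=3306)
    try:
        uploader.upload_data(db_name='测试库', table_name='测试表',
                             data=data, set_typ=set_typ)
    finally:
        uploader.close()   # always release pooled connections and flush final metrics
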