mdbq 4.1.14__py3-none-any.whl → 4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mdbq might be problematic. Click here for more details.

mdbq/mysql/uploader.py CHANGED
@@ -1,659 +1,278 @@
1
1
  # -*- coding:utf-8 -*-
2
+ """
3
+ MySQL数据上传器 - 重构版本
4
+ 提供高可用、易维护的MySQL数据上传功能
5
+ """
6
+
2
7
  import datetime
3
- import re
4
8
  import time
9
+ import json
10
+ import re
11
+ from typing import Union, List, Dict, Optional, Any, Tuple
5
12
  from functools import wraps
6
- import warnings
13
+ from decimal import Decimal, InvalidOperation
14
+ import math
15
+
7
16
  import pymysql
8
17
  import pandas as pd
9
- import os
10
- from mdbq.log import mylogger
11
- from mdbq.myconf import myconf
12
- from typing import Union, List, Dict, Optional, Any, Tuple, Set
13
18
  from dbutils.pooled_db import PooledDB
14
- from decimal import Decimal, InvalidOperation
15
- import math
16
- import json
19
+ from mdbq.log import mylogger
20
+ # from mdbq.myconf import myconf
17
21
 
18
- warnings.filterwarnings('ignore')
22
+ # 配置日志
19
23
  logger = mylogger.MyLogger(
20
24
  logging_mode='file',
21
25
  log_level='info',
22
26
  log_format='json',
23
27
  max_log_size=50,
24
28
  backup_count=5,
25
- enable_async=False, # 是否启用异步日志
26
- sample_rate=1, # 采样DEBUG/INFO日志
27
- sensitive_fields=[], # 敏感字段过滤
28
- enable_metrics=False, # 是否启用性能指标
29
+ enable_async=False,
30
+ sample_rate=1,
31
+ sensitive_fields=[],
32
+ enable_metrics=False,
29
33
  )
30
34
 
31
35
 
32
- def count_decimal_places(num_str: str) -> Tuple[int, int]:
33
- """
34
- 统计小数点前后位数,支持科学计数法。
35
- 返回:(整数位数, 小数位数)
36
- """
37
- try:
38
- d = Decimal(str(num_str))
39
- sign, digits, exponent = d.as_tuple()
40
- int_part = len(digits) + exponent if exponent < 0 else len(digits)
41
- dec_part = -exponent if exponent < 0 else 0
42
- return max(int_part, 0), max(dec_part, 0)
43
- except (InvalidOperation, ValueError, TypeError):
44
- return (0, 0)
45
-
46
-
47
- class StatementCache(dict):
48
- """LRU缓存实现,用于SQL语句缓存"""
49
- def __init__(self, maxsize=100):
50
- super().__init__()
51
- self._maxsize = maxsize
52
- self._order = []
53
- def __getitem__(self, key):
54
- value = super().__getitem__(key)
55
- self._order.remove(key)
56
- self._order.append(key)
57
- return value
58
- def __setitem__(self, key, value):
59
- if key in self:
60
- self._order.remove(key)
61
- elif len(self._order) >= self._maxsize:
62
- oldest = self._order.pop(0)
63
- super().__delitem__(oldest)
64
- super().__setitem__(key, value)
65
- self._order.append(key)
66
- def get(self, key, default=None):
67
- if key in self:
68
- return self[key]
69
- return default
70
-
71
- class MySQLUploader:
72
- """
73
- MySQL数据上传
36
+ class DatabaseConnectionManager:
37
+ """数据库连接管理器"""
74
38
 
75
- 用于将数据上传到MySQL数据库,支持自动建表、分表、数据验证等功能。
76
- 使用连接池管理数据库连接。
77
- """
78
- def __init__(
79
- self,
80
- username: str,
81
- password: str,
82
- host: str = 'localhost',
83
- port: int = 3306,
84
- charset: str = 'utf8mb4',
85
- collation: str = 'utf8mb4_0900_ai_ci',
86
- max_retries: int = 10,
87
- retry_waiting_time: int = 10,
88
- pool_size: int = 5,
89
- mincached: int = 0,
90
- maxcached: int = 0,
91
- connect_timeout: int = 10,
92
- read_timeout: int = 30,
93
- write_timeout: int = 30,
94
- ssl: Optional[Dict] = None
95
- ):
96
- """
97
- 初始化MySQL上传器
98
-
99
- :param username: 数据库用户名
100
- :param password: 数据库密码
101
- :param host: 数据库主机地址,默认为localhost
102
- :param port: 数据库端口,默认为3306
103
- :param charset: 字符集,默认为utf8mb4
104
- :param collation: 排序规则,默认为utf8mb4_0900_ai_ci,对大小写不敏感,utf8mb4_0900_as_cs/utf8mb4_bin: 对大小写敏感
105
- :param max_retries: 最大重试次数,默认为10
106
- :param retry_waiting_time: 重试间隔(秒),默认为10
107
- :param pool_size: 连接池大小,默认为5
108
- :param mincached: 空闲连接数量
109
- :param maxcached: 最大空闲连接数, 0表示不设上限, 由连接池自动管理
110
- :param connect_timeout: 连接超时(秒),默认为10
111
- :param read_timeout: 读取超时(秒),默认为30
112
- :param write_timeout: 写入超时(秒),默认为30
113
- :param ssl: SSL配置字典,默认为None
114
- :param auto_creat_missing_cols: 自动添加缺失列,默认为False,建议手动维护表结构
115
- """
116
- self.username = username
117
- self.password = password
118
- self.host = host
119
- self.port = int(port)
120
- self.charset = charset
121
- self.collation = collation
122
- self.max_retries = max(max_retries, 1)
123
- self.retry_waiting_time = max(retry_waiting_time, 1)
124
- self.pool_size = max(pool_size, 1)
125
- self.mincached = mincached
126
- self.maxcached = maxcached
127
- self.connect_timeout = connect_timeout
128
- self.read_timeout = read_timeout
129
- self.write_timeout = write_timeout
130
- self.base_excute_col = ['id', '更新时间'] # 排重插入数据时始终排除该列
131
- self.case_sensitive = False # 是否保持大小写敏感,默认为False(转为小写)
132
- self.ssl = ssl
133
- self._prepared_statements = StatementCache(maxsize=100)
134
- self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
135
- self._table_metadata_cache = {}
136
- self.metadata_cache_ttl = 300 # 5分钟缓存时间
137
- self.pool = self._create_connection_pool() # 创建连接池
138
- self.auto_creat_missing_cols = False # 自动添加缺失列,正常不要自动添加,建议手动维护表结构
139
-
140
- def _create_connection_pool(self) -> PooledDB:
141
- """
142
- 创建数据库连接池
143
-
144
- :return: PooledDB连接池实例
145
- :raises ConnectionError: 当连接池创建失败时抛出
146
- """
147
- if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
148
- return self.pool
39
+ def __init__(self, config: Dict[str, Any]):
40
+ self.config = config
149
41
  self.pool = None
42
+ self._create_pool()
43
+
44
+ def _create_pool(self):
45
+ """创建连接池"""
150
46
  pool_params = {
151
47
  'creator': pymysql,
152
- 'host': self.host,
153
- 'port': self.port,
154
- 'user': self.username,
155
- 'password': self.password,
156
- 'charset': self.charset,
48
+ 'host': self.config['host'],
49
+ 'port': self.config['port'],
50
+ 'user': self.config['username'],
51
+ 'password': self.config['password'],
52
+ 'charset': self.config['charset'],
157
53
  'cursorclass': pymysql.cursors.DictCursor,
158
- 'maxconnections': self.pool_size,
159
- 'mincached': self.mincached,
160
- 'maxcached': self.maxcached,
54
+ 'maxconnections': self.config['pool_size'],
55
+ 'mincached': self.config.get('mincached', 0),
56
+ 'maxcached': self.config.get('maxcached', 0),
161
57
  'ping': 7,
162
- 'connect_timeout': self.connect_timeout,
163
- 'read_timeout': self.read_timeout,
164
- 'write_timeout': self.write_timeout,
58
+ 'connect_timeout': self.config.get('connect_timeout', 10),
59
+ 'read_timeout': self.config.get('read_timeout', 30),
60
+ 'write_timeout': self.config.get('write_timeout', 30),
165
61
  'autocommit': False
166
62
  }
167
- if self.ssl:
168
- required_keys = {'ca', 'cert', 'key'}
169
- if not all(k in self.ssl for k in required_keys):
170
- error_msg = 'SSL配置必须包含ca、cert和key'
171
- logger.error(error_msg)
172
- raise ValueError(error_msg)
173
- pool_params['ssl'] = {
174
- 'ca': self.ssl['ca'],
175
- 'cert': self.ssl['cert'],
176
- 'key': self.ssl['key'],
177
- 'check_hostname': self.ssl.get('check_hostname', False)
178
- }
63
+
64
+ if self.config.get('ssl'):
65
+ pool_params['ssl'] = self.config['ssl']
66
+
179
67
  try:
180
- pool = PooledDB(**pool_params)
181
- logger.debug('连接池创建成功', {'连接池': self.pool_size, 'host': self.host, 'port': self.port})
182
- return pool
68
+ self.pool = PooledDB(**pool_params)
69
+ logger.debug('数据库连接池创建成功', {'host': self.config['host']})
183
70
  except Exception as e:
184
- self.pool = None
185
- logger.error('连接池创建失败', {'error': str(e), 'host': self.host, 'port': self.port})
71
+ logger.error('连接池创建失败', {'error': str(e)})
186
72
  raise ConnectionError(f'连接池创建失败: {str(e)}')
73
+
74
+ def get_connection(self):
75
+ """获取数据库连接"""
76
+ if not self.pool:
77
+ self._create_pool()
78
+ return self.pool.connection()
79
+
80
+ def close(self):
81
+ """关闭连接池"""
82
+ if self.pool:
83
+ self.pool = None
84
+ logger.debug('数据库连接池已关闭')
85
+
187
86
 
87
+ class DataTypeInferrer:
88
+ """数据类型推断器"""
89
+
188
90
  @staticmethod
189
- def _execute_with_retry(func):
190
- """
191
- 带重试机制的装饰器,用于数据库操作
192
- :param func: 被装饰的函数
193
- :return: 装饰后的函数
194
- :raises: 可能抛出原始异常或最后一次重试的异常
195
- """
196
- @wraps(func)
197
- def wrapper(self, *args, **kwargs):
198
- last_exception = None
199
- operation = func.__name__
200
- for attempt in range(self.max_retries):
201
- try:
202
- result = func(self, *args, **kwargs)
203
- if attempt > 0:
204
- logger.debug('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
205
- return result
206
- except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
207
- last_exception = e
208
- error_details = {
209
- 'operation': operation,
210
- 'error_code': e.args[0] if e.args else None,
211
- 'error_message': e.args[1] if len(e.args) > 1 else None,
212
- 'attempt': attempt + 1,
213
- 'max_retries': self.max_retries
214
- }
215
- if attempt < self.max_retries - 1:
216
- wait_time = self.retry_waiting_time * (attempt + 1)
217
- error_details['wait_time'] = wait_time
218
- logger.warning('数据库操作失败,准备重试', error_details)
219
- time.sleep(wait_time)
220
- try:
221
- self.pool = self._create_connection_pool()
222
- logger.debug('成功重新建立数据库连接')
223
- except Exception as reconnect_error:
224
- logger.error('重连失败', {'error': str(reconnect_error)})
225
- else:
226
- logger.error('操作最终失败', error_details)
227
- except Exception as e:
228
- last_exception = e
229
- logger.error('发生意外错误', {
230
- 'operation': operation,
231
- 'error_type': type(e).__name__,
232
- 'error_message': str(e),
233
- 'error_args': e.args if hasattr(e, 'args') else None
234
- })
235
- break
236
- raise last_exception if last_exception else Exception('发生未知错误')
237
- return wrapper
91
+ def infer_mysql_type(value: Any) -> str:
92
+ """推断MySQL数据类型"""
93
+ if value is None or str(value).lower() in ['', 'none', 'nan']:
94
+ return 'VARCHAR(255)'
95
+
96
+ if isinstance(value, bool):
97
+ return 'TINYINT(1)'
98
+ elif isinstance(value, int):
99
+ if -2147483648 <= value <= 2147483647:
100
+ return 'INT'
101
+ else:
102
+ return 'BIGINT'
103
+ elif isinstance(value, float):
104
+ return 'DECIMAL(20,6)'
105
+ elif isinstance(value, (datetime.datetime, pd.Timestamp)):
106
+ return 'DATETIME'
107
+ elif isinstance(value, datetime.date):
108
+ return 'DATE'
109
+ elif isinstance(value, (list, dict)):
110
+ return 'JSON'
111
+ elif isinstance(value, str):
112
+ # 尝试判断是否是日期时间
113
+ if DataValidator.is_datetime_string(value):
114
+ return 'DATETIME'
115
+
116
+ # 根据字符串长度选择类型
117
+ length = len(value)
118
+ if length <= 255:
119
+ return 'VARCHAR(255)'
120
+ elif length <= 65535:
121
+ return 'TEXT'
122
+ else:
123
+ return 'LONGTEXT'
124
+
125
+ return 'VARCHAR(255)'
126
+
127
+ @staticmethod
128
+ def infer_types_from_data(data: List[Dict]) -> Dict[str, str]:
129
+ """从数据中推断所有列的类型"""
130
+ if not data:
131
+ return {}
132
+
133
+ type_map = {}
134
+ for row in data[:10]: # 只检查前10行
135
+ for col, value in row.items():
136
+ # 跳过系统列
137
+ if col.lower() in ['id', 'create_at', 'update_at']:
138
+ continue
139
+ if col not in type_map and value is not None:
140
+ type_map[col] = DataTypeInferrer.infer_mysql_type(value)
141
+
142
+ # 自动添加系统列类型定义
143
+ type_map['id'] = 'BIGINT'
144
+ type_map['create_at'] = 'TIMESTAMP'
145
+ type_map['update_at'] = 'TIMESTAMP'
146
+
147
+ return type_map
238
148
 
239
- @_execute_with_retry
240
- def _get_connection(self) -> pymysql.connections.Connection:
241
- """
242
- 从连接池获取数据库连接
243
149
 
244
- :return: 数据库连接对象
245
- :raises ConnectionError: 当获取连接失败时抛出
246
- """
247
- try:
248
- conn = self.pool.connection()
249
- return conn
250
- except Exception as e:
251
- logger.error('从连接池获取数据库连接失败,尝试重建连接池', {'error': str(e)})
252
- # 强制重建连接池
150
+ class DataValidator:
151
+ """数据验证器"""
152
+
153
+ @staticmethod
154
+ def is_datetime_string(value: str) -> bool:
155
+ """检查字符串是否为日期时间格式"""
156
+ formats = [
157
+ '%Y-%m-%d %H:%M:%S',
158
+ '%Y-%m-%d',
159
+ '%Y/%m/%d %H:%M:%S',
160
+ '%Y/%m/%d',
161
+ '%Y%m%d',
162
+ '%Y-%m-%dT%H:%M:%S',
163
+ ]
164
+
165
+ for fmt in formats:
253
166
  try:
254
- self.pool = self._create_connection_pool()
255
- conn = self.pool.connection()
256
- logger.debug('重建连接池后获取连接成功')
257
- return conn
258
- except Exception as e2:
259
- logger.error('重建连接池后依然获取连接失败', {'error': str(e2)})
260
- raise ConnectionError(f'连接数据库失败: {str(e2)}')
261
-
262
- @_execute_with_retry
263
- def _check_database_exists(self, db_name: str) -> bool:
264
- """
265
- 检查数据库是否存在
266
-
267
- :param db_name: 数据库名称
268
- :return: 存在返回True,否则返回False
269
- :raises: 可能抛出数据库相关异常
270
- """
271
- db_name = self._validate_identifier(db_name, is_database=True)
272
- sql = 'SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s'
273
- conn = None
274
- try:
275
- with self._get_connection() as conn:
276
- with conn.cursor() as cursor:
277
- cursor.execute(sql, (db_name,))
278
- exists = bool(cursor.fetchone())
279
- logger.debug('数据库存在检查', {'库': db_name, '存在': exists})
280
- return exists
281
- except Exception as e:
282
- logger.error('检查数据库是否存在时出错', {'库': db_name, '错误': str(e)})
283
- raise
284
-
285
- @_execute_with_retry
286
- def _create_database(self, db_name: str) -> None:
287
- """
288
- 创建数据库
289
-
290
- :param db_name: 要创建的数据库名称
291
- :raises: 可能抛出数据库相关异常
292
- """
293
- db_name = self._validate_identifier(db_name, is_database=True)
294
- sql = f'CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}'
295
- conn = None
296
- try:
297
- with self._get_connection() as conn:
298
- with conn.cursor() as cursor:
299
- cursor.execute(sql)
300
- conn.commit()
301
- logger.debug('数据库已创建', {'库': db_name})
302
- except Exception as e:
303
- logger.error('无法创建数据库', {'库': db_name, '错误': str(e)})
304
- if conn is not None:
305
- conn.rollback()
306
- raise
307
-
308
- def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
309
- """
310
- 获取分表名称
311
-
312
- :param table_name: 基础表名
313
- :param date_value: 日期值
314
- :param partition_by: 分表方式 ('year' 或 'month' 或 'none')
315
- :return: 分表名称
316
- :raises ValueError: 如果日期格式无效或分表方式无效
317
- """
318
- try:
319
- date_obj = self._validate_datetime(value=date_value, date_type=True, no_log=False)
320
- except ValueError:
321
- logger.error('无效的日期格式', {'表': table_name, '日期值': date_value})
322
- raise ValueError(f"`{table_name}` 无效的日期格式: `{date_value}`")
323
- if partition_by == 'year':
324
- return f"{table_name}_{date_obj.year}"
325
- elif partition_by == 'month':
326
- return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
327
- else:
328
- logger.error('分表方式无效', {'表': table_name, '分表方式': partition_by})
329
- raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
330
-
331
- def _validate_identifier(self, identifier: str, is_database: bool = False) -> str:
332
- """
333
- 验证并清理数据库标识符(表名、列名等)
334
-
335
- :param identifier: 要验证的标识符
336
- :param is_database: 是否为数据库名,数据库名不能以数字开头
337
- :return: 清理后的安全标识符
338
- :raises ValueError: 当标识符无效时抛出
339
- """
340
- if not identifier or not isinstance(identifier, str):
341
- logger.error('无效的标识符', {'标识符': identifier})
342
- raise ValueError(f"无效的标识符: `{identifier}`")
343
- # 始终做特殊字符清理
344
- cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
345
- cleaned = re.sub(r'_+', '_', cleaned).strip('_')
346
- # 如果清理后为空字符串,使用默认标识符
347
- if not cleaned:
348
- logger.warning('标识符清理后为空,使用默认标识符', {'原始标识符': identifier})
349
- # 使用原始标识符的哈希值作为后缀,保持可追溯性
350
- import hashlib
351
- hash_suffix = hashlib.md5(identifier.encode('utf-8')).hexdigest()[:8]
352
- cleaned = f'unknown_col_{hash_suffix}'
353
-
354
- # 数据库名不能以数字开头(MySQL要求),但表名和列名可以
355
- if is_database and cleaned and cleaned[0].isdigit():
356
- cleaned = f'db_{cleaned}'
357
- logger.warning('为数字开头的数据库名添加db_前缀', {
358
- '原始标识符': identifier,
359
- '清理后': cleaned
360
- })
167
+ datetime.datetime.strptime(value, fmt)
168
+ return True
169
+ except ValueError:
170
+ continue
171
+ return False
172
+
173
+ @staticmethod
174
+ def validate_and_convert_value(value: Any, mysql_type: str, allow_null: bool = False) -> Any:
175
+ """验证并转换数据值"""
176
+ mysql_type_lower = mysql_type.lower()
177
+
178
+ # 处理空值
179
+ if value is None or (isinstance(value, str) and value.strip() == ''):
180
+ if allow_null:
181
+ return None
182
+ # 对于日期时间类型,直接返回默认的日期时间值
183
+ if 'datetime' in mysql_type_lower or 'timestamp' in mysql_type_lower:
184
+ return '2000-01-01 00:00:00'
185
+ elif 'date' in mysql_type_lower:
186
+ return '2000-01-01'
187
+ return DataValidator._get_default_value(mysql_type)
188
+
189
+ # 处理pandas的NaN值
190
+ if not isinstance(value, (list, dict)):
191
+ try:
192
+ if pd.isna(value) or (isinstance(value, float) and math.isinf(value)):
193
+ if allow_null:
194
+ return None
195
+ # 对于日期时间类型,直接返回默认的日期时间值
196
+ if 'datetime' in mysql_type_lower or 'timestamp' in mysql_type_lower:
197
+ return '2000-01-01 00:00:00'
198
+ elif 'date' in mysql_type_lower:
199
+ return '2000-01-01'
200
+ return DataValidator._get_default_value(mysql_type)
201
+ except (ValueError, TypeError):
202
+ pass
361
203
 
362
- mysql_keywords = {
363
- 'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
364
- 'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
365
- }
366
- if len(cleaned) > 64:
367
- cleaned = cleaned[:64]
368
- if cleaned.lower() in mysql_keywords:
369
- logger.debug('存在MySQL保留字', {'标识符': cleaned})
370
- return f"`{cleaned}`"
371
- return cleaned
372
-
373
- @_execute_with_retry
374
- def _check_table_exists(self, db_name: str, table_name: str) -> bool:
375
- """
376
- 检查表是否存在
377
-
378
- :param db_name: 数据库名
379
- :param table_name: 表名
380
- :return: 存在返回True,否则返回False
381
- :raises: 可能抛出数据库相关异常
382
- """
383
- cache_key = f"{db_name}.{table_name}"
384
- if cache_key in self._table_metadata_cache:
385
- cached_time, result = self._table_metadata_cache[cache_key]
386
- if time.time() - cached_time < self.metadata_cache_ttl:
387
- logger.debug('表存在缓存命中', {'库': db_name, '表': table_name, '存在': result})
388
- return result
389
- db_name = self._validate_identifier(db_name, is_database=True)
390
- table_name = self._validate_identifier(table_name)
391
- sql = """
392
- SELECT TABLE_NAME
393
- FROM INFORMATION_SCHEMA.TABLES
394
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
395
- """
396
- try:
397
- with self._get_connection() as conn:
398
- with conn.cursor() as cursor:
399
- cursor.execute(sql, (db_name, table_name))
400
- result = bool(cursor.fetchone())
401
- except Exception as e:
402
- logger.error('检查数据表是否存在时发生未知错误', {'库': db_name, '表': table_name, '错误': str(e)})
403
- raise
404
- self._table_metadata_cache[cache_key] = (time.time(), result)
405
- logger.debug('表存在检查', {'库': db_name, '表': table_name, '存在': result})
406
- return result
407
-
408
- @_execute_with_retry
409
- def _create_table(
410
- self,
411
- db_name: str,
412
- table_name: str,
413
- set_typ: Dict[str, str],
414
- primary_keys: Optional[List[str]] = None,
415
- date_column: Optional[str] = None,
416
- indexes: Optional[List[str]] = None,
417
- allow_null: bool = False,
418
- unique_keys: Optional[List[List[str]]] = None
419
- ) -> None:
420
- """
421
- 创建数据表,优化索引创建方式
422
- """
423
- db_name = self._validate_identifier(db_name, is_database=True)
424
- table_name = self._validate_identifier(table_name)
425
- if not set_typ:
426
- logger.error('建表时未指定set_typ', {'库': db_name, '表': table_name})
427
- raise ValueError('set_typ 未指定')
428
- # set_typ的键清洗
429
- set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
430
-
431
- # 处理id列和主键
432
- column_defs = []
204
+ # JSON类型
205
+ if 'json' in mysql_type_lower:
206
+ if isinstance(value, (dict, list)):
207
+ return json.dumps(value, ensure_ascii=False)
208
+ elif isinstance(value, str):
209
+ try:
210
+ json.loads(value)
211
+ return value
212
+ except (TypeError, ValueError):
213
+ raise ValueError(f"无效的JSON字符串: {value}")
214
+ else:
215
+ return str(value)
433
216
 
434
- # 添加id列(仅在没有指定主键时)
435
- if not primary_keys:
436
- column_defs.append("`id` INT NOT NULL AUTO_INCREMENT")
217
+ # 日期时间类型
218
+ if 'datetime' in mysql_type_lower or 'timestamp' in mysql_type_lower:
219
+ return DataValidator._convert_to_datetime(value)
220
+ elif 'date' in mysql_type_lower:
221
+ return DataValidator._convert_to_date(value)
437
222
 
438
- # 添加其他列,确保时间戳字段按正确顺序添加
439
- timestamp_cols = ['创建时间', '更新时间']
440
- regular_cols = []
441
- timestamp_defs = []
223
+ # 数值类型
224
+ elif 'int' in mysql_type_lower:
225
+ return DataValidator._convert_to_int(value)
226
+ elif any(t in mysql_type_lower for t in ['decimal', 'float', 'double']):
227
+ return DataValidator._convert_to_decimal(value)
442
228
 
443
- # 先处理非时间戳字段
444
- for col_name, col_type in set_typ.items():
445
- if col_name == 'id':
446
- continue
447
- if col_name in timestamp_cols:
448
- continue # 时间戳字段稍后按顺序处理
449
-
450
- safe_col_name = self._normalize_col(col_name)
451
- col_def = f"`{safe_col_name}` {col_type}"
452
- if not allow_null and not col_type.lower().startswith('json') and not col_type.lower().startswith('timestamp'):
453
- col_def += " NOT NULL"
454
- regular_cols.append(col_def)
455
-
456
- # 按固定顺序添加时间戳字段
457
- for timestamp_col in timestamp_cols:
458
- if timestamp_col in set_typ:
459
- safe_col_name = self._normalize_col(timestamp_col)
460
- col_type = set_typ[timestamp_col]
461
- col_def = f"`{safe_col_name}` {col_type}"
462
- # TIMESTAMP字段不需要额外的NOT NULL,因为已经包含在类型定义中
463
- timestamp_defs.append(col_def)
464
-
465
- # 合并所有列定义:常规字段 + 时间戳字段
466
- column_defs.extend(regular_cols)
467
- column_defs.extend(timestamp_defs)
468
-
469
- # 主键处理逻辑调整
470
- def _index_col_sql(col):
471
- col_type = set_typ.get(col, '').lower()
472
- if 'varchar' in col_type:
473
- m = re.search(r'varchar\((\d+)\)', col_type)
474
- if m:
475
- maxlen = int(m.group(1))
476
- prefix_len = min(100, maxlen)
477
- return f"`{self._normalize_col(col)}`({prefix_len})"
478
- else:
479
- return f"`{self._normalize_col(col)}`(100)"
480
- elif 'text' in col_type:
481
- return f"`{self._normalize_col(col)}`(100)"
482
- else:
483
- return f"`{self._normalize_col(col)}`"
484
-
485
- # 处理主键
486
- if primary_keys and len(primary_keys) > 0:
487
- # 验证主键列是否存在于set_typ中
488
- valid_primary_keys = []
489
- for pk in primary_keys:
490
- normalized_pk = self._normalize_col(pk)
491
- if normalized_pk in set_typ:
492
- valid_primary_keys.append(pk)
493
- else:
494
- logger.warning('主键列不存在于表结构中,跳过', {
495
- '库': db_name,
496
- '表': table_name,
497
- '列': pk,
498
- '规范化后': normalized_pk,
499
- '可用列': list(set_typ.keys())
500
- })
501
-
502
- if valid_primary_keys:
503
- # 如果指定了主键,直接使用指定的主键
504
- safe_primary_keys = [_index_col_sql(pk) for pk in valid_primary_keys]
505
- primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
229
+ # 字符串类型
230
+ elif 'varchar' in mysql_type_lower:
231
+ str_value = str(value)
232
+ # 检查长度限制
233
+ match = re.search(r'\((\d+)\)', mysql_type)
234
+ if match:
235
+ max_len = int(match.group(1))
236
+ if len(str_value.encode('utf-8')) > max_len:
237
+ return str_value.encode('utf-8')[:max_len].decode('utf-8', 'ignore')
238
+ return str_value
239
+
240
+ # 默认转为字符串
241
+ return str(value)
242
+
243
+ @staticmethod
244
+ def _get_default_value(mysql_type: str) -> Any:
245
+ """获取MySQL类型的默认值"""
246
+ mysql_type_lower = mysql_type.lower()
247
+
248
+ if any(t in mysql_type_lower for t in ['int', 'bigint', 'tinyint', 'smallint']):
249
+ return 0
250
+ elif any(t in mysql_type_lower for t in ['decimal', 'float', 'double']):
251
+ return 0.0
252
+ elif any(t in mysql_type_lower for t in ['varchar', 'text', 'char']):
253
+ return 'none'
254
+ elif 'date' in mysql_type_lower:
255
+ if 'datetime' in mysql_type_lower:
256
+ return '2000-01-01 00:00:00'
506
257
  else:
507
- # 如果没有有效的主键,使用id作为主键
508
- logger.warning('所有主键列都不存在于表结构中,使用默认id主键', {
509
- '': db_name,
510
- '表': table_name,
511
- '原始主键': primary_keys
512
- })
513
- primary_key_sql = f"PRIMARY KEY (`id`)"
258
+ return '2000-01-01'
259
+ elif 'json' in mysql_type_lower:
260
+ return '{}'
514
261
  else:
515
- # 如果没有指定主键,使用id作为主键
516
- primary_key_sql = f"PRIMARY KEY (`id`)"
517
-
518
- # 索引统一在CREATE TABLE中定义
519
- index_defs = []
520
- if date_column and date_column in set_typ:
521
- safe_date_col = _index_col_sql(date_column)
522
- index_defs.append(f"INDEX `idx_{self._normalize_col(date_column)}` ({safe_date_col})")
523
-
524
- # 收集所有唯一约束中涉及的列,避免重复创建普通索引
525
- unique_columns = set()
526
- if unique_keys:
527
- for unique_cols in unique_keys:
528
- if unique_cols:
529
- for col in unique_cols:
530
- normalized_col = self._normalize_col(col)
531
- if normalized_col in set_typ:
532
- unique_columns.add(normalized_col)
533
-
534
- if indexes:
535
- for idx_col in indexes:
536
- normalized_idx_col = self._normalize_col(idx_col)
537
- if normalized_idx_col in set_typ:
538
- # 检查是否与唯一约束冲突
539
- if normalized_idx_col in unique_columns:
540
- logger.warning('索引列已在唯一约束中定义,跳过普通索引', {
541
- '库': db_name,
542
- '表': table_name,
543
- '列': idx_col,
544
- '原因': '列已在唯一约束中定义'
545
- })
546
- continue
547
- safe_idx_col = _index_col_sql(idx_col)
548
- index_defs.append(f"INDEX `idx_{normalized_idx_col}` ({safe_idx_col})")
549
- else:
550
- logger.warning('索引列不存在于表结构中,跳过', {
551
- '库': db_name,
552
- '表': table_name,
553
- '列': idx_col,
554
- '规范化后': normalized_idx_col,
555
- '可用列': list(set_typ.keys())
556
- })
557
-
558
- # UNIQUE KEY定义
559
- unique_defs = []
560
- if unique_keys:
561
- for unique_cols in unique_keys:
562
- if not unique_cols:
563
- continue
564
- # 检查唯一约束是否与主键冲突
565
- if primary_keys:
566
- # 如果唯一约束的列是主键的一部分,则跳过
567
- if set(unique_cols).issubset(set(primary_keys)):
568
- logger.warning('跳过与主键冲突的唯一约束', {
569
- '库': db_name,
570
- '表': table_name,
571
- '唯一约束': unique_cols,
572
- '主键': primary_keys
573
- })
574
- continue
575
-
576
- # 验证唯一约束的列是否存在于set_typ中
577
- valid_unique_cols = []
578
- for col in unique_cols:
579
- normalized_col = self._normalize_col(col)
580
- if normalized_col in set_typ:
581
- valid_unique_cols.append(col)
582
- else:
583
- logger.warning('唯一约束列不存在于表结构中,跳过', {
584
- '库': db_name,
585
- '表': table_name,
586
- '列': col,
587
- '规范化后': normalized_col,
588
- '可用列': list(set_typ.keys())
589
- })
590
-
591
- if not valid_unique_cols:
592
- logger.warning('唯一约束的所有列都不存在于表结构中,跳过整个约束', {
593
- '库': db_name,
594
- '表': table_name,
595
- '原始约束': unique_cols
596
- })
597
- continue
598
-
599
- safe_unique_cols = [_index_col_sql(col) for col in valid_unique_cols]
600
- unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in valid_unique_cols])}"
601
- unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
602
-
603
- index_defs = list(set(index_defs))
604
- all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
605
-
606
- # 添加调试日志
607
- logger.debug('建表SQL生成', {
608
- '库': db_name,
609
- '表': table_name,
610
- '列定义': column_defs,
611
- '主键': primary_key_sql,
612
- '索引': index_defs,
613
- '唯一约束': unique_defs,
614
- 'set_typ键': list(set_typ.keys())
615
- })
616
-
617
- sql = f"""
618
- CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
619
- {','.join(all_defs)}
620
- ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
621
- """
622
- conn = None
623
- try:
624
- with self._get_connection() as conn:
625
- with conn.cursor() as cursor:
626
- cursor.execute(sql)
627
- conn.commit()
628
- logger.debug('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
629
- except Exception as e:
630
- logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e), '异常类型': type(e).__name__})
631
- if conn is not None:
632
- conn.rollback()
633
- raise
634
-
635
- def _validate_datetime(self, value: str, date_type: bool = False, no_log: bool = False) -> Any:
636
- """
637
- 验证并标准化日期时间格式
638
-
639
- :param value: 日期时间值
640
- :param date_type: 是否返回日期类型(True)或字符串(False)
641
- :param no_log: 记录日志,默认为False
642
- :return: 标准化后的日期时间字符串或日期对象
643
- :raises ValueError: 当日期格式无效时抛出
644
- """
645
- # 处理 pandas Timestamp 对象
262
+ return 'none'
263
+
264
+ @staticmethod
265
+ def _convert_to_datetime(value: Any) -> str:
266
+ """转换为datetime格式"""
646
267
  if hasattr(value, 'strftime'):
647
- # 如果是 Timestamp 或 datetime 对象,直接格式化
648
- if date_type:
649
- return pd.to_datetime(value.strftime('%Y-%m-%d'))
650
- else:
651
- return value.strftime('%Y-%m-%d %H:%M:%S')
268
+ return value.strftime('%Y-%m-%d %H:%M:%S')
269
+
270
+ value_str = str(value).strip()
271
+
272
+ # 处理特殊的无效值
273
+ if value_str.lower() in ['none', 'null', 'nan', '', 'nat']:
274
+ return '2000-01-01 00:00:00'
652
275
 
653
- # 确保 value 是字符串
654
- if not isinstance(value, str):
655
- value = str(value)
656
-
657
276
  formats = [
658
277
  '%Y-%m-%d %H:%M:%S',
659
278
  '%Y-%m-%d',
@@ -661,2251 +280,607 @@ class MySQLUploader:
661
280
  '%Y/%m/%d',
662
281
  '%Y%m%d',
663
282
  '%Y-%m-%dT%H:%M:%S',
664
- '%Y-%m-%d %H:%M:%S.%f',
665
- '%Y/%-m/%-d', # 2023/1/8
666
- '%Y-%-m-%-d', # 2023-01-8
667
283
  ]
284
+
668
285
  for fmt in formats:
669
286
  try:
670
- if date_type:
671
- result = pd.to_datetime(datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d'))
672
- return result
673
- else:
674
- result = datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
675
- return result
287
+ dt = datetime.datetime.strptime(value_str, fmt)
288
+ return dt.strftime('%Y-%m-%d %H:%M:%S')
676
289
  except ValueError:
677
290
  continue
678
- if not no_log:
679
- logger.error('无效的日期格式', {'值': value})
680
- raise ValueError(f"无效的日期格式: `{value}`")
681
-
682
- def _get_fallback_value(self, column_type_lower: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None, original_value: Any = None) -> Any:
683
- """
684
- 获取空值的兜底填充值
685
- """
686
- # 兜底填充值映射
687
- fallback_map = {
688
- 'int': 0,
689
- 'bigint': 0,
690
- 'tinyint': 0,
691
- 'smallint': 0,
692
- 'mediumint': 0,
693
- 'decimal': 0.0,
694
- 'float': 0.0,
695
- 'double': 0.0,
696
- 'date': '2000-01-01',
697
- 'datetime': '2000-01-01 00:00:00',
698
- 'timestamp': '2000-01-01 00:00:00',
699
- 'json': '{}',
700
- 'varchar': 'none',
701
- 'text': 'none',
702
- 'char': 'none',
703
- 'mediumtext': 'none',
704
- 'longtext': 'none',
705
- 'enum': None, # enum类型需要特殊处理,使用第一个可选值
706
- 'set': '', # set类型默认为空字符串
707
- }
708
291
 
709
- fallback = 'none'
710
- for typ, val in fallback_map.items():
711
- if typ in column_type_lower:
712
- if typ == 'enum' and val is None:
713
- # 对于enum类型,使用第一个可选值作为默认值
714
- enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type_lower)
715
- fallback = enum_values[0] if enum_values else 'none'
716
- else:
717
- fallback = val
718
- break
719
-
720
- if not allow_null:
721
- # 注释掉,这里可能会产生大量日志
722
- # logger.debug("该列不允许为空值", {
723
- # "库": db_name,
724
- # "": table_name,
725
- # "allow_null": allow_null,
726
- # "列": col_name,
727
- # "值": original_value,
728
- # "兜底值": fallback
729
- # })
730
- return fallback # 直接返回兜底值
731
-
732
- return None # 允许空值时返回None
733
-
734
- def _convert_to_int(self, value):
735
- """
736
- 尝试将value转换为int
737
- """
738
- # 处理numpy/pandas标量
739
- if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
292
+ # 如果所有格式都无法解析,返回默认值而不是抛出异常
293
+ return '2000-01-01 00:00:00'
294
+
295
+ @staticmethod
296
+ def _convert_to_date(value: Any) -> str:
297
+ """转换为date格式"""
298
+ if hasattr(value, 'strftime'):
299
+ return value.strftime('%Y-%m-%d')
300
+
301
+ # 先转为datetime再提取日期部分
302
+ datetime_str = DataValidator._convert_to_datetime(value)
303
+ return datetime_str.split(' ')[0]
304
+
305
+ @staticmethod
306
+ def _convert_to_int(value: Any) -> int:
307
+ """转换为整数"""
308
+ if hasattr(value, 'item'):
740
309
  try:
741
310
  value = value.item()
742
311
  except Exception:
743
312
  pass
744
- elif hasattr(value, 'value') and not isinstance(value, str):
745
- try:
746
- extracted_value = value.value
747
- if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').isdigit():
748
- value = extracted_value
749
- except Exception:
750
- pass
313
+
751
314
  try:
752
315
  return int(value)
753
316
  except (ValueError, TypeError):
754
317
  try:
755
318
  return int(float(value))
756
319
  except (ValueError, TypeError):
757
- raise
758
-
759
- def _convert_to_float(self, value):
760
- """
761
- 尝试将value转换为float,兼容常见数值类型。
762
- """
763
- if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
320
+ raise ValueError(f"无法转换为整数: {value}")
321
+
322
+ @staticmethod
323
+ def _convert_to_decimal(value: Any) -> Decimal:
324
+ """转换为Decimal"""
325
+ if hasattr(value, 'item'):
764
326
  try:
765
327
  value = value.item()
766
328
  except Exception:
767
329
  pass
768
- elif hasattr(value, 'value') and not isinstance(value, str):
769
- try:
770
- extracted_value = value.value
771
- if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
772
- value = extracted_value
773
- except Exception:
774
- pass
775
- return float(value)
330
+
331
+ # 处理百分比字符串
332
+ if isinstance(value, str) and '%' in value:
333
+ if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
334
+ value = float(value.strip().replace('%', '')) / 100
335
+
336
+ try:
337
+ return Decimal(str(value))
338
+ except (ValueError, TypeError, InvalidOperation):
339
+ raise ValueError(f"无法转换为数值: {value}")
776
340
 
777
- def _convert_to_decimal(self, value):
778
- """
779
- 尝试将value转换为Decimal,兼容常见数值类型。
780
- """
781
- if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
782
- try:
783
- value = value.item()
784
- except Exception:
785
- pass
786
- elif hasattr(value, 'value') and not isinstance(value, str):
787
- try:
788
- extracted_value = value.value
789
- if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
790
- value = extracted_value
791
- except Exception:
792
- pass
793
- return Decimal(str(value))
794
341
 
795
- def _truncate_str(self, str_value, max_len):
796
- """
797
- 截断字符串到指定字节长度(utf-8)。
798
- """
799
- return str_value.encode('utf-8')[:max_len].decode('utf-8', 'ignore')
800
-
801
- def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
802
- """
803
- 根据列类型验证并转换数据值
804
- """
805
- column_type_lower = column_type.lower() if column_type else ''
806
-
807
- # 对于包含CURRENT_TIMESTAMP的TIMESTAMP字段,跳过验证,让MySQL自动处理
808
- if ('timestamp' in column_type_lower and 'current_timestamp' in column_type_lower and
809
- col_name in ['创建时间', '更新时间']):
810
- # 这些字段由MySQL自动处理,不需要传入值
811
- return None
812
-
813
- # 统一的空值检查(None、空字符串、NaN)
814
- is_empty_value = False
815
- if value is None:
816
- is_empty_value = True
817
- elif value == '':
818
- # 空字符串对于字符串类型是有效值
819
- if any(t in column_type_lower for t in ['varchar', 'text', 'char', 'mediumtext', 'longtext']):
820
- return ""
821
- is_empty_value = True
822
- else:
823
- # 检查NaN值(避免对列表和字典使用pd.isna)
824
- if not isinstance(value, (list, dict)):
825
- try:
826
- is_empty_value = pd.isna(value) or (isinstance(value, (float, Decimal)) and math.isinf(value))
827
- except (ValueError, TypeError):
828
- is_empty_value = False
829
-
830
- # 统一处理空值
831
- if is_empty_value:
832
- fallback_value = self._get_fallback_value(column_type_lower, allow_null, db_name, table_name, col_name, value)
833
- # 如果返回了兜底值(非None),直接返回,不再进行后续验证
834
- # 因为兜底值已经是根据列类型设计的合适值
835
- if fallback_value is not None:
836
- return fallback_value
837
- # 如果返回None(允许空值的情况),继续后续处理
838
- return None
839
-
840
- # JSON类型验证和转换
841
- if 'json' in column_type_lower:
842
- if isinstance(value, (dict, list)):
843
- try:
844
- return json.dumps(value, ensure_ascii=False)
845
- except (TypeError, ValueError) as e:
846
- logger.error(f"JSON序列化失败: {e}", {"库": db_name, "表": table_name, "列": col_name, "值": value})
847
- raise ValueError(f"JSON序列化失败: {e}")
848
- elif isinstance(value, str):
849
- # 验证字符串是否为有效的JSON
850
- try:
851
- json.loads(value)
852
- return value
853
- except (TypeError, ValueError) as e:
854
- logger.error(f"无效的JSON字符串: {e}", {"库": db_name, "表": table_name, "列": col_name, "值": value})
855
- raise ValueError(f"无效的JSON字符串: {e}")
856
- else:
857
- # 其他类型转换为字符串
858
- return str(value)
859
-
860
- original_value = value
861
-
862
- # 日期时间类型验证
863
- if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
864
- return self._validate_datetime(value, date_type=False, no_log=True)
865
- elif 'date' in column_type_lower:
866
- return self._validate_datetime(value, date_type=True, no_log=True)
867
- # 数值类型验证
868
- elif 'int' in column_type_lower:
869
- try:
870
- return self._convert_to_int(value)
871
- except (ValueError, TypeError):
872
- logger.error(f"值 `{value}` 无法转换为整数", {"库": db_name, "表": table_name, "列": col_name})
873
- raise ValueError(f"值 `{value}` 无法转换为整数")
874
- elif any(t in column_type_lower for t in ['decimal', 'float', 'double']):
875
- # 百分比字符串处理
876
- if isinstance(value, str) and '%' in value:
877
- try:
878
- if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
879
- value = float(value.strip().replace('%', '')) / 100
880
- else:
881
- logger.warning("百分比字符串不符合格式,跳过转换", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
882
- value = original_value
883
- except (ValueError, TypeError):
884
- logger.warning("百分比字符串转换失败,保留原始值", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
885
- value = original_value
886
- try:
887
- if 'decimal' in column_type_lower:
888
- precision, scale = self._get_decimal_scale(column_type)
889
- value_decimal = self._convert_to_decimal(value)
890
- if len(value_decimal.as_tuple().digits) - abs(value_decimal.as_tuple().exponent) > precision - scale:
891
- raise ValueError(f"整数部分超出范围")
892
- return value_decimal
893
- else: # float/double
894
- return self._convert_to_float(value)
895
- except (ValueError, TypeError, InvalidOperation) as e:
896
- logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
897
- raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
898
- # ENUM类型验证
899
- elif 'enum' in column_type_lower:
900
- # 提取enum的可选值,支持单引号和双引号
901
- enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
902
- str_value = str(value).strip()
903
- if str_value not in enum_values:
904
- logger.error(f"值 `{str_value}` 不在enum允许的值中: {enum_values}",
905
- {"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
906
- raise ValueError(f"值 `{str_value}` 不在enum允许的值中: {enum_values}")
907
- return str_value
908
- # SET类型验证
909
- elif 'set' in column_type_lower:
910
- # 提取set的可选值,支持单引号和双引号
911
- set_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
912
- str_value = str(value).strip()
913
- # SET类型可以是多个值的组合,用逗号分隔
914
- if ',' in str_value:
915
- input_values = [v.strip() for v in str_value.split(',')]
916
- else:
917
- input_values = [str_value]
918
-
919
- for val in input_values:
920
- if val and val not in set_values:
921
- logger.error(f"值 `{val}` 不在set允许的值中: {set_values}",
922
- {"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
923
- raise ValueError(f"值 `{val}` 不在set允许的值中: {set_values}")
924
- return str_value
925
- # 字符串类型验证
926
- elif 'varchar' in column_type_lower:
927
- str_value = str(value)
928
- try:
929
- max_len = int(re.search(r'\((\d+)\)', column_type).group(1))
930
- if len(str_value.encode('utf-8')) > max_len:
931
- logger.warning(f"列`{col_name}`的值`{str_value}`长度({len(str_value.encode('utf-8'))})超出varchar({max_len})限制,将进行截断", {"库": db_name, "表": table_name})
932
- return self._truncate_str(str_value, max_len)
933
- except (AttributeError, IndexError):
934
- pass
935
- return str_value
342
+ class TableManager:
343
+ """表管理器"""
344
+
345
+ def __init__(self, connection_manager: DatabaseConnectionManager, collation: str):
346
+ self.conn_mgr = connection_manager
347
+ self.collation = collation
348
+
349
+ def ensure_database_exists(self, db_name: str):
350
+ """确保数据库存在"""
351
+ db_name = self._sanitize_identifier(db_name)
936
352
 
937
- # 兜底处理:确保所有返回值都是基本数据类型
938
- if isinstance(value, (dict, list)):
939
- try:
940
- return json.dumps(value, ensure_ascii=False)
941
- except (TypeError, ValueError):
942
- return str(value)
943
- else:
944
- return str(value)
945
-
946
- @_execute_with_retry
947
- def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
948
- """
949
- 获取表的列名和数据类型
950
-
951
- :param db_name: 数据库名
952
- :param table_name: 表名
953
- :return: 列名和数据类型字典 {列名: 数据类型}
954
- :raises: 可能抛出数据库相关异常
955
- """
956
- db_name = self._validate_identifier(db_name, is_database=True)
957
- table_name = self._validate_identifier(table_name)
958
- sql = """
959
- SELECT COLUMN_NAME, DATA_TYPE
960
- FROM INFORMATION_SCHEMA.COLUMNS
961
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
962
- ORDER BY ORDINAL_POSITION
963
- """
964
- try:
965
- with self._get_connection() as conn:
966
- with conn.cursor() as cursor:
967
- cursor.execute(sql, (db_name, table_name))
968
- if self.case_sensitive:
969
- set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
970
- else:
971
- set_typ = {row['COLUMN_NAME'].lower(): row['DATA_TYPE'] for row in cursor.fetchall()}
972
- logger.debug('获取表的列信息', {'库': db_name, '表': table_name, '列信息': set_typ})
973
- return set_typ
974
- except Exception as e:
975
- logger.error('无法获取表列信息', {'库': db_name, '表': table_name, '错误': str(e)})
976
- raise
977
-
978
- def _ensure_index(self, db_name: str, table_name: str, column: str):
979
- """
980
- 确保某列有索引,如果没有则创建。
353
+ with self.conn_mgr.get_connection() as conn:
354
+ with conn.cursor() as cursor:
355
+ cursor.execute(
356
+ "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s",
357
+ (db_name,)
358
+ )
359
+ if not cursor.fetchone():
360
+ charset = self.conn_mgr.config['charset']
361
+ sql = f"CREATE DATABASE `{db_name}` CHARACTER SET {charset} COLLATE {self.collation}"
362
+ cursor.execute(sql)
363
+ conn.commit()
364
+ logger.debug('数据库已创建', {'database': db_name})
365
+
366
+ def table_exists(self, db_name: str, table_name: str) -> bool:
367
+ """检查表是否存在"""
368
+ db_name = self._sanitize_identifier(db_name)
369
+ table_name = self._sanitize_identifier(table_name)
370
+
371
+ with self.conn_mgr.get_connection() as conn:
372
+ with conn.cursor() as cursor:
373
+ cursor.execute(
374
+ "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s",
375
+ (db_name, table_name)
376
+ )
377
+ return bool(cursor.fetchone())
378
+
379
+ def create_table(self, db_name: str, table_name: str, columns: Dict[str, str],
380
+ primary_keys: Optional[List[str]] = None,
381
+ unique_keys: Optional[List[List[str]]] = None):
382
+ """创建表"""
383
+ db_name = self._sanitize_identifier(db_name)
384
+ table_name = self._sanitize_identifier(table_name)
385
+
386
+ # 构建列定义
387
+ column_defs = []
388
+
389
+ # 始终添加自增ID列作为主键
390
+ column_defs.append("`id` BIGINT NOT NULL AUTO_INCREMENT")
391
+
392
+ # 添加业务列
393
+ for col_name, col_type in columns.items():
394
+ if col_name.lower() in ['id', 'create_at', 'update_at']:
395
+ continue
396
+ safe_col_name = self._sanitize_identifier(col_name)
397
+ column_defs.append(f"`{safe_col_name}` {col_type} NOT NULL")
398
+
399
+ # 添加时间戳列
400
+ column_defs.append("`create_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP")
401
+ column_defs.append("`update_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
402
+
403
+ # 主键定义(始终使用id作为主键)
404
+ primary_key_def = "PRIMARY KEY (`id`)"
405
+
406
+ # 唯一约束定义
407
+ unique_defs = []
408
+ if unique_keys:
409
+ for i, uk in enumerate(unique_keys):
410
+ # 过滤掉系统列
411
+ filtered_uk = [col for col in uk if col.lower() not in ['id', 'create_at', 'update_at']]
412
+ if filtered_uk:
413
+ safe_uk = [f"`{self._sanitize_identifier(col)}`" for col in filtered_uk]
414
+ unique_name = f"uniq_{i}"
415
+ unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_uk)})")
416
+
417
+ # 组合所有定义
418
+ all_defs = column_defs + [primary_key_def] + unique_defs
419
+
420
+ charset = self.conn_mgr.config['charset']
421
+ sql = f"""
422
+ CREATE TABLE `{db_name}`.`{table_name}` (
423
+ {','.join(all_defs)}
424
+ ) ENGINE=InnoDB DEFAULT CHARSET={charset} COLLATE={self.collation}
981
425
  """
982
- db_name = self._validate_identifier(db_name, is_database=True)
983
- table_name = self._validate_identifier(table_name)
984
- column = self._validate_identifier(column)
985
- # 检查索引是否已存在
986
- sql_check = '''
987
- SELECT COUNT(1) FROM INFORMATION_SCHEMA.STATISTICS
988
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s
989
- '''
990
- sql_create = f'ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{column}` (`{column}`)'
991
- try:
992
- with self._get_connection() as conn:
993
- with conn.cursor() as cursor:
994
- cursor.execute(sql_check, (db_name, table_name, column))
995
- exists = cursor.fetchone()
996
- if exists and list(exists.values())[0] > 0:
997
- logger.debug('索引检查', {'库': db_name, '表': table_name, '索引列': column})
998
- return
999
- cursor.execute(sql_create)
426
+
427
+ with self.conn_mgr.get_connection() as conn:
428
+ with conn.cursor() as cursor:
429
+ cursor.execute(sql)
1000
430
  conn.commit()
1001
- logger.debug('已为列创建索引', {'': db_name, '': table_name, '列': column})
1002
- except Exception as e:
1003
- logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
1004
- raise
1005
-
1006
- def _get_existing_unique_keys(self, db_name: str, table_name: str) -> List[List[str]]:
1007
- """
1008
- 获取表中所有UNIQUE KEY的列组合(不含主键)。
1009
- 返回:[[col1, col2], ...]
1010
- """
1011
- db_name = self._validate_identifier(db_name, is_database=True)
1012
- table_name = self._validate_identifier(table_name)
1013
- sql = '''
1014
- SELECT INDEX_NAME, COLUMN_NAME
1015
- FROM INFORMATION_SCHEMA.STATISTICS
1016
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND NON_UNIQUE = 0 AND INDEX_NAME != 'PRIMARY'
1017
- ORDER BY INDEX_NAME, SEQ_IN_INDEX
1018
- '''
1019
- unique_map = {}
1020
- try:
1021
- with self._get_connection() as conn:
1022
- with conn.cursor() as cursor:
1023
- cursor.execute(sql, (db_name, table_name))
1024
- for row in cursor.fetchall():
1025
- idx = row['INDEX_NAME']
1026
- col = row['COLUMN_NAME']
1027
- unique_map.setdefault(idx, []).append(col)
1028
- except Exception as e:
1029
- logger.warning('获取UNIQUE KEY信息失败', {'库': db_name, '表': table_name, '错误': str(e)})
1030
- # 只返回列名组合,全部清洗小写
1031
- return [[self._normalize_col(c) for c in cols] for cols in unique_map.values() if cols]
1032
-
1033
- def _add_unique_key(self, db_name: str, table_name: str, unique_cols: List[str]):
1034
- """
1035
- 添加UNIQUE KEY
1036
- """
1037
- safe_cols = [self._normalize_col(col) for col in unique_cols]
1038
- unique_name = f"uniq_{'_'.join(safe_cols)}"
1039
- sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
431
+ logger.debug('表已创建', {'database': db_name, 'table': table_name})
432
+
433
+ def get_partition_table_name(self, base_name: str, date_value: str, partition_by: str) -> str:
434
+ """获取分表名称"""
1040
435
  try:
1041
- with self._get_connection() as conn:
1042
- with conn.cursor() as cursor:
1043
- cursor.execute(sql)
1044
- conn.commit()
1045
- logger.debug('添加唯一约束列成功', {'库': db_name, '表': table_name, '列': unique_cols})
1046
- except Exception as e:
1047
- logger.warning('唯一约束列添加失败', {'库': db_name, '表': table_name, '列': unique_cols, '错误': str(e)})
1048
-
1049
- def _upload_to_table(
1050
- self,
1051
- db_name: str,
1052
- table_name: str,
1053
- data: List[Dict],
1054
- set_typ: Dict[str, str],
1055
- primary_keys: Optional[List[str]],
1056
- check_duplicate: bool,
1057
- duplicate_columns: Optional[List[str]],
1058
- allow_null: bool,
1059
- auto_create: bool,
1060
- date_column: Optional[str],
1061
- indexes: Optional[List[str]],
1062
- batch_id: Optional[str] = None,
1063
- update_on_duplicate: bool = False,
1064
- transaction_mode: str = "batch",
1065
- unique_keys: Optional[List[List[str]]] = None
1066
- ):
1067
- """实际执行表上传的方法"""
1068
- table_existed = self._check_table_exists(db_name, table_name)
1069
- if not table_existed:
1070
- if auto_create:
1071
- self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
1072
- allow_null=allow_null, unique_keys=unique_keys)
1073
- else:
1074
- logger.error('数据表不存在', {
1075
- '库': db_name,
1076
- '表': table_name,
1077
- })
1078
- raise ValueError(f"数据表不存在: `{db_name}`.`{table_name}`")
1079
- if table_existed and unique_keys:
1080
- try:
1081
- exist_ukeys = self._get_existing_unique_keys(db_name, table_name)
1082
- exist_ukeys_norm = [sorted([c.lower() for c in uk]) for uk in exist_ukeys]
1083
- filtered_ukeys = [uk for uk in unique_keys if 1 <= len(uk) <= 20]
1084
- to_add = []
1085
- for uk in filtered_ukeys:
1086
- norm_uk = sorted([c.lower() for c in uk])
1087
- if norm_uk not in exist_ukeys_norm:
1088
- to_add.append(uk)
1089
- max_unique_keys = 10
1090
- if len(exist_ukeys) + len(to_add) > max_unique_keys:
1091
- logger.warning('unique_keys超限', {
1092
- '库': db_name,
1093
- '表': table_name,
1094
- '已存在': exist_ukeys,
1095
- '本次待添加': to_add,
1096
- '最大数量': max_unique_keys
1097
- })
1098
- to_add = to_add[:max_unique_keys - len(exist_ukeys)]
1099
- for uk in to_add:
1100
- self._add_unique_key(db_name, table_name, uk)
1101
- except Exception as e:
1102
- logger.warning('动态unique key处理异常', {'库': db_name, '表': table_name, '错误': str(e)})
1103
- table_columns = self._get_table_columns(db_name, table_name)
1104
- if not table_columns:
1105
- logger.error('获取列失败', {
1106
- '库': db_name,
1107
- '表': table_name,
1108
- '列': self._shorten_for_log(table_columns),
1109
- })
1110
- raise ValueError(f"获取列失败 `{db_name}`.`{table_name}`")
1111
- # 检查并自动添加缺失的列
1112
- missing_columns = [col for col in set_typ if col not in table_columns]
1113
- if missing_columns:
1114
- if not self.auto_creat_missing_cols:
1115
- logger.error('列不存在且不支持自动添加,请手动维护表结构,并补齐缺失列', {
1116
- '库': db_name,
1117
- '表': table_name,
1118
- '缺失列数': len(missing_columns),
1119
- '缺失列': missing_columns,
1120
- })
1121
- raise ValueError(f"列不存在: `{missing_columns}` -> `{db_name}`.`{table_name}`")
436
+ if isinstance(date_value, str):
437
+ date_obj = pd.to_datetime(date_value)
1122
438
  else:
1123
- # 表有缺失列时报错,建议不允许自动添加,手动检查数据一致性,以免产生不必要的表错误
1124
- # 自动添加缺失的列
1125
- for col in missing_columns:
1126
- try:
1127
- self._add_column_to_table(db_name, table_name, col, set_typ[col], allow_null)
1128
- logger.info('自动添加缺失列', {
1129
- '库': db_name,
1130
- '表': table_name,
1131
- '列': col,
1132
- '类型': set_typ[col]
1133
- })
1134
- except Exception as e:
1135
- logger.error('添加列失败', {
1136
- '库': db_name,
1137
- '表': table_name,
1138
- '列': col,
1139
- '类型': set_typ[col],
1140
- '错误': str(e)
1141
- })
1142
- raise ValueError(f"添加列失败: `{col}` -> `{db_name}`.`{table_name}`: {str(e)}")
1143
-
1144
- # 重新获取表列信息
1145
- table_columns = self._get_table_columns(db_name, table_name)
1146
- if date_column and date_column in table_columns:
1147
- try:
1148
- self._ensure_index(db_name, table_name, date_column)
1149
- except Exception as e:
1150
- logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
1151
- inserted, skipped, failed = self._insert_data(
1152
- db_name, table_name, data, set_typ,
1153
- check_duplicate, duplicate_columns,
1154
- batch_id=batch_id,
1155
- update_on_duplicate=update_on_duplicate,
1156
- transaction_mode=transaction_mode
1157
- )
1158
- return inserted, skipped, failed
1159
-
1160
- def _infer_data_type(self, value: Any, no_log: bool = False) -> str:
1161
- """
1162
- 根据值推断合适的MySQL数据类型
1163
-
1164
- :param value: 要推断的值
1165
- :param no_log: 记录日志,默认为False
1166
- :return: MySQL数据类型字符串
1167
- """
1168
- if value is None or str(value).lower() in ['', 'none', 'nan']:
1169
- return 'VARCHAR(255)' # 默认字符串类型
1170
-
1171
- # 检查是否是百分比字符串
1172
- if isinstance(value, str):
1173
- if '%' in value:
1174
- if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
1175
- return 'DECIMAL(10, 4)' # 百分比转为小数,使用DECIMAL
1176
- else:
1177
- return 'VARCHAR(255)' # 不符合格式的百分比,视为字符串
1178
-
1179
- if isinstance(value, bool):
1180
- return 'TINYINT(1)'
1181
- elif isinstance(value, int):
1182
- # if -128 <= value <= 127:
1183
- # return 'TINYINT'
1184
- # elif -32768 <= value <= 32767:
1185
- # return 'SMALLINT'
1186
- # elif -8388608 <= value <= 8388607:
1187
- # return 'MEDIUMINT'
1188
- if -2147483648 <= value <= 2147483647:
1189
- return 'INT'
1190
- else:
1191
- return 'BIGINT'
1192
- elif isinstance(value, float):
1193
- # 计算小数位数
1194
- num_str = str(value)
1195
- _, decimal_places = count_decimal_places(num_str)
1196
- return f'DECIMAL(20,{min(decimal_places, 6)})' # 限制最大6位小数
1197
- elif isinstance(value, (datetime.datetime, pd.Timestamp)):
1198
- return 'DATETIME'
1199
- elif isinstance(value, datetime.date):
1200
- return 'DATE'
1201
- elif isinstance(value, (list, dict)):
1202
- return 'JSON'
1203
- elif isinstance(value, str):
1204
- # 尝试判断是否是日期时间
1205
- try:
1206
- self._validate_datetime(value=value, date_type=False, no_log=no_log)
1207
- return 'DATETIME'
1208
- except ValueError:
1209
- pass
1210
-
1211
- # 根据字符串长度选择合适类型
1212
- length = len(value)
1213
- if length <= 255:
1214
- return 'VARCHAR(255)'
1215
- elif length <= 65535:
1216
- return 'TEXT'
1217
- elif length <= 16777215:
1218
- return 'MEDIUMTEXT'
1219
- else:
1220
- return 'LONGTEXT'
439
+ date_obj = date_value
1221
440
 
1222
- return 'VARCHAR(255)' # 默认字符串类型
1223
-
1224
- def normalize_column_names(self, data: Union[pd.DataFrame, List[Dict[str, Any]]]) -> Union[
1225
- pd.DataFrame, List[Dict[str, Any]]]:
1226
- """
1227
- 1. pandas:规范化列名
1228
- 2. 字典列表:规范化每个字典的键
1229
- """
1230
- if isinstance(data, pd.DataFrame):
1231
- if self.case_sensitive:
1232
- data.columns = [self._validate_identifier(col) for col in data.columns]
1233
- else:
1234
- data.columns = [self._validate_identifier(col).lower() for col in data.columns]
1235
- return data
1236
- elif isinstance(data, list):
1237
- if self.case_sensitive:
1238
- return [{self._validate_identifier(k): v for k, v in item.items()} for item in data]
441
+ if partition_by == 'year':
442
+ return f"{base_name}_{date_obj.year}"
443
+ elif partition_by == 'month':
444
+ return f"{base_name}_{date_obj.year}_{date_obj.month:02d}"
1239
445
  else:
1240
- return [{self._validate_identifier(k).lower(): v for k, v in item.items()} for item in data]
1241
- return data
1242
-
1243
- def _prepare_data(
1244
- self,
1245
- data: Union[Dict, List[Dict], pd.DataFrame],
1246
- set_typ: Dict[str, str],
1247
- allow_null: bool = False,
1248
- db_name: str = None,
1249
- table_name: str = None,
1250
- auto_timestamps: bool = False
1251
- ) -> Tuple[List[Dict], Dict[str, str]]:
1252
- """
1253
- 准备要上传的数据,验证并转换数据类型
1254
- 根据set_typ自动处理所有数据类型的列:补齐缺失的列并丢弃多余的列
1255
- """
1256
- # 处理自动时间戳功能
1257
- if auto_timestamps:
1258
- data, set_typ = self._process_auto_timestamps(data, set_typ, db_name, table_name)
446
+ raise ValueError("partition_by必须是'year'或'month'")
447
+ except Exception as e:
448
+ raise ValueError(f"无效的日期值: {date_value}, 错误: {str(e)}")
449
+
450
+ @staticmethod
451
+ def _sanitize_identifier(identifier: str) -> str:
452
+ """清理标识符"""
453
+ if not identifier or not isinstance(identifier, str):
454
+ raise ValueError(f"无效的标识符: {identifier}")
455
+
456
+ # 清理特殊字符
457
+ cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
458
+ cleaned = re.sub(r'_+', '_', cleaned).strip('_')
459
+
460
+ if not cleaned:
461
+ raise ValueError(f"标识符清理后为空: {identifier}")
1259
462
 
1260
- # set_typ的键清洗
1261
- if not set_typ:
1262
- set_typ = {}
1263
- normalized_set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
463
+ # 检查MySQL关键字
464
+ mysql_keywords = {
465
+ 'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
466
+ 'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
467
+ }
1264
468
 
1265
- # 统一数据格式为字典列表
469
+ if len(cleaned) > 64:
470
+ cleaned = cleaned[:64]
471
+
472
+ if cleaned.lower() in mysql_keywords:
473
+ return f"`{cleaned}`"
474
+
475
+ return cleaned
476
+
477
+
478
+ class DataProcessor:
479
+ """数据处理器"""
480
+
481
+ @staticmethod
482
+ def normalize_data(data: Union[Dict, List[Dict], pd.DataFrame]) -> List[Dict]:
483
+ """标准化数据格式为字典列表"""
1266
484
  if isinstance(data, pd.DataFrame):
1267
- try:
1268
- if self.case_sensitive:
1269
- data.columns = [self._validate_identifier(col) for col in data.columns]
1270
- else:
1271
- data.columns = [self._validate_identifier(col).lower() for col in data.columns]
1272
- data = data.replace({pd.NA: None}).to_dict('records')
1273
- except Exception as e:
1274
- logger.error('DataFrame处理时发生错误', {
1275
- 'error': str(e),
1276
- 'data': self._shorten_for_log(data),
1277
- })
1278
- raise ValueError(f"DataFrame处理时发生错误: {e}")
485
+ return data.to_dict('records')
1279
486
  elif isinstance(data, dict):
1280
- if self.case_sensitive:
1281
- data = [{k: v for k, v in data.items()}]
1282
- else:
1283
- data = [{k.lower(): v for k, v in data.items()}]
487
+ return [data]
1284
488
  elif isinstance(data, list) and all(isinstance(item, dict) for item in data):
1285
- if self.case_sensitive:
1286
- data = [{k: v for k, v in item.items()} for item in data]
1287
- else:
1288
- data = [{k.lower(): v for k, v in item.items()} for item in data]
1289
- else:
1290
- logger.error('数据结构必须是字典、列表、字典列表或dataframe', {
1291
- 'data': self._shorten_for_log(data),
1292
- })
1293
- raise ValueError("数据结构必须是字典、列表、字典列表或dataframe")
1294
-
1295
- # 统一处理原始数据中列名的特殊字符
1296
- data = self.normalize_column_names(data)
1297
-
1298
- if not normalized_set_typ:
1299
- logger.warning('set_typ为空, 将自动推断数据类型, 可能存在数据类型识别错误')
1300
-
1301
- # 根据set_typ处理所有数据的列:严格按set_typ定义的列进行过滤
1302
- filtered_set_typ = {}
1303
- data_columns = list(data[0].keys()) if data and len(data) > 0 else []
1304
-
1305
- if normalized_set_typ:
1306
- # 严格按照set_typ定义的列进行过滤,排除id列
1307
- for col in normalized_set_typ:
1308
- if (self.case_sensitive and col == 'id') or (not self.case_sensitive and col.lower() == 'id'):
1309
- continue
1310
- filtered_set_typ[col] = normalized_set_typ[col]
1311
-
1312
- # 对所有数据行进行列处理:补齐缺失列,丢弃多余列
1313
- processed_data = []
1314
- for row in data:
1315
- processed_row = {}
1316
- # 只保留set_typ中定义的列
1317
- for col in filtered_set_typ:
1318
- if col in row:
1319
- processed_row[col] = row[col]
1320
- else:
1321
- processed_row[col] = None # 缺失列用None填充
1322
- processed_data.append(processed_row)
1323
- data = processed_data
1324
-
1325
- # 检查是否有丢弃的列
1326
- dropped_columns = [col for col in data_columns if col not in filtered_set_typ]
1327
- if dropped_columns:
1328
- logger.warning('数据中存在set_typ未定义的列并已被丢弃', {
1329
- '库': db_name,
1330
- '表': table_name,
1331
- '丢弃列': dropped_columns,
1332
- # '保留列': list(filtered_set_typ.keys())
1333
- })
1334
-
1335
- logger.debug('数据列处理完成', {
1336
- '库': db_name,
1337
- '表': table_name,
1338
- '原始列': data_columns,
1339
- '目标列': list(filtered_set_typ.keys()),
1340
- '丢弃列': dropped_columns
1341
- })
489
+ return data
1342
490
  else:
1343
- # 如果set_typ为空,则推断所有数据列的类型
1344
- for col in data_columns:
1345
- if col not in filtered_set_typ:
1346
- # 推断类型
1347
- sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
1348
- inferred_type = None
1349
- for val in sample_values:
1350
- inferred_type = self._infer_data_type(val, no_log=True)
1351
- if inferred_type:
1352
- break
1353
- if not inferred_type:
1354
- inferred_type = 'VARCHAR(255)'
1355
- filtered_set_typ[col] = inferred_type
1356
- logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
1357
-
491
+ raise ValueError("数据格式必须是字典、字典列表或DataFrame")
492
+
493
+ @staticmethod
494
+ def prepare_data_for_insert(data: List[Dict], set_typ: Dict[str, str],
495
+ allow_null: bool = False) -> List[Dict]:
496
+ """准备插入数据"""
1358
497
  prepared_data = []
498
+
1359
499
  for row_idx, row in enumerate(data, 1):
1360
500
  prepared_row = {}
1361
- for col_name in filtered_set_typ:
1362
- # 跳过id列,不允许外部传入id
1363
- if (self.case_sensitive and col_name == 'id') or (not self.case_sensitive and col_name.lower() == 'id'):
1364
- continue
1365
- # 对于自动时间戳字段,跳过处理,让MySQL自动处理
1366
- col_type_lower = filtered_set_typ[col_name].lower()
1367
- is_auto_timestamp = ('timestamp' in col_type_lower and 'current_timestamp' in col_type_lower and
1368
- col_name in ['创建时间', '更新时间'])
1369
-
1370
- if is_auto_timestamp:
1371
- # 自动时间戳字段完全跳过,不在INSERT语句中包含
501
+
502
+ for col_name, col_type in set_typ.items():
503
+ # 跳过系统列(id, create_at, update_at由MySQL自动处理)
504
+ if col_name.lower() in ['id', 'create_at', 'update_at']:
1372
505
  continue
1373
506
 
1374
- if col_name not in row:
1375
- # 对于缺失的列,使用None作为默认值,在_validate_value中会根据allow_null和列类型进行进一步处理
1376
- try:
1377
- prepared_row[col_name] = self._validate_value(None, filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
1378
- except ValueError as e:
1379
- if not allow_null:
1380
- # 如果不允许空值但验证失败,尝试使用兜底值
1381
- try:
1382
- fallback_value = self._get_fallback_value(filtered_set_typ[col_name].lower(), allow_null, db_name, table_name, col_name, None)
1383
- if fallback_value is not None:
1384
- prepared_row[col_name] = fallback_value
1385
- logger.warning(f"行号:{row_idx} -> 缺失列: `{col_name}`, 使用兜底值: {fallback_value}", {'row': self._shorten_for_log(row)})
1386
- else:
1387
- error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`, 且不允许空值"
1388
- logger.error(error_msg, {'row': self._shorten_for_log(row)})
1389
- raise ValueError(error_msg)
1390
- except Exception:
1391
- error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`, 且不允许空值"
1392
- logger.error(error_msg, {'row': self._shorten_for_log(row)})
1393
- raise ValueError(error_msg)
1394
- else:
1395
- prepared_row[col_name] = None
1396
- else:
1397
- # 处理用户传入的值
1398
- try:
1399
- prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
1400
- except ValueError as e:
1401
- # 如果数据验证失败,检查是否为空值且不允许空值,尝试使用兜底值
1402
- original_value = row[col_name]
1403
- is_empty_original = (original_value is None or
1404
- original_value == '' or
1405
- (not isinstance(original_value, (list, dict)) and
1406
- pd.isna(original_value) if hasattr(pd, 'isna') else False))
1407
-
1408
- if is_empty_original and not allow_null:
1409
- try:
1410
- fallback_value = self._get_fallback_value(filtered_set_typ[col_name].lower(), allow_null, db_name, table_name, col_name, original_value)
1411
- if fallback_value is not None:
1412
- prepared_row[col_name] = fallback_value
1413
- logger.warning(f"行:{row_idx}, 列:`{col_name}` -> 原值验证失败,使用兜底值: {fallback_value}", {
1414
- '原值': original_value,
1415
- '兜底值': fallback_value,
1416
- 'row': self._shorten_for_log(row)
1417
- })
1418
- else:
1419
- logger.error('数据验证失败', {
1420
- '列': col_name,
1421
- '行': row_idx,
1422
- '报错': str(e),
1423
- 'row': self._shorten_for_log(row),
1424
- })
1425
- raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
1426
- except Exception:
1427
- logger.error('数据验证失败', {
1428
- '列': col_name,
1429
- '行': row_idx,
1430
- '报错': str(e),
1431
- 'row': self._shorten_for_log(row),
1432
- })
1433
- raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
1434
- else:
1435
- logger.error('数据验证失败', {
1436
- '列': col_name,
1437
- '行': row_idx,
1438
- '报错': str(e),
1439
- 'row': self._shorten_for_log(row),
1440
- })
1441
- raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
1442
- prepared_data.append(prepared_row)
1443
- return prepared_data, filtered_set_typ
1444
-
1445
- def upload_data(
1446
- self,
1447
- db_name: str,
1448
- table_name: str,
1449
- data: Union[Dict, List[Dict], pd.DataFrame],
1450
- set_typ: Dict[str, str],
1451
- primary_keys: Optional[List[str]] = None,
1452
- check_duplicate: bool = False,
1453
- duplicate_columns: Optional[List[str]] = None,
1454
- allow_null: bool = False,
1455
- partition_by: Optional[str] = None,
1456
- partition_date_column: str = '日期',
1457
- auto_create: bool = True,
1458
- indexes: Optional[List[str]] = None,
1459
- update_on_duplicate: bool = False,
1460
- transaction_mode: str = "batch",
1461
- unique_keys: Optional[List[List[str]]] = None,
1462
- auto_timestamps: bool = False
1463
- ):
1464
- """
1465
- 上传数据到数据库的主入口方法
1466
-
1467
- :param db_name: 数据库名
1468
- :param table_name: 表名
1469
- :param data: 要上传的数据,支持字典、字典列表或DataFrame格式
1470
- :param set_typ: 列名和数据类型字典 {列名: 数据类型}
1471
- :param primary_keys: 主键列列表,可选。格式:['col1', 'col2'] 或 None
1472
- :param check_duplicate: 是否检查重复数据,默认为False
1473
- :param duplicate_columns: 用于检查重复的列,可选。格式:['col1', 'col2'] 或 None
1474
- :param allow_null: 是否允许空值,默认为False
1475
- :param partition_by: 分表方式('year'、'month'、'None'),可选
1476
- :param partition_date_column: 用于分表的日期列名,默认为'日期', 默认会添加为索引
1477
- :param auto_create: 表不存在时是否自动创建,默认为True
1478
- :param indexes: 需要创建索引的列列表,可选。格式:['col1', 'col2'] 或 None
1479
- :param update_on_duplicate: 遇到重复数据时是否更新旧数据,默认为False
1480
- :param transaction_mode: 事务模式,可选值:
1481
- - 'row' : 逐行提交事务(错误隔离性好)
1482
- - 'batch' : 整批提交事务(性能最优)
1483
- - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
1484
- :param unique_keys: 唯一约束列表,每个元素为列名列表,支持多列组合唯一约束。格式:[['col1', 'col2'], ['col3']] 或 None
1485
- :param auto_timestamps: 是否自动添加创建时间和更新时间列,默认为False。启用后会自动添加'创建时间'和'更新时间'两列
1486
- :raises: 可能抛出各种验证和数据库相关异常
1487
-
1488
- ---
1489
- 参数格式验证:
1490
-
1491
- - primary_keys: 必须是字符串列表或None,如 ['col1', 'col2']
1492
- - indexes: 必须是字符串列表或None,如 ['col1', 'col2']
1493
- - unique_keys: 必须是嵌套列表或None,如 [['col1', 'col2'], ['col3']]
1494
- - 错误示例:unique_keys=['col1', 'col2'] (应该是 [['col1', 'col2']])
1495
- - 所有列名不能为空字符串,会自动去除首尾空格
1496
- - 重复的列名会被自动去重
1497
-
1498
- 空值处理规则:
1499
- - None: 直接返回None,忽略此参数
1500
- - []: 空列表,返回None,忽略此参数
1501
- - [[]]: 包含空列表,跳过空列表,如果最终为空则返回None
1502
- - ['']: 包含空字符串,抛出异常(不允许空字符串)
1503
- - [' ']: 包含纯空白字符,抛出异常(不允许纯空白字符)
1504
- - ['', 'col1']: 混合空字符串和有效字符串,跳过空字符串,保留有效字符串
1505
-
1506
- ---
1507
- 关于 indexes 和 unique_keys 参数:
1508
-
1509
- - indexes 创建普通索引,unique_keys 创建唯一约束
1510
- - 如果同一列同时出现在 indexes 和 unique_keys 中,系统会优先创建唯一约束,跳过普通索引
1511
- - 唯一约束本身就具有索引功能,因此不会重复创建普通索引
1512
- - 建议:如果某列需要唯一性约束,直接使用 unique_keys 参数,无需在 indexes 中重复指定
1513
-
1514
- ---
1515
- unique_keys、check_duplicate、update_on_duplicate 三者组合下的行为总结:
1516
-
1517
- | unique_keys | check_duplicate | update_on_duplicate | 行为 |
1518
- |-------------|----------------|---------------------|------------------------------|
1519
- | 有/无 | False | False | 冲突时报错/跳过,不覆盖 |
1520
- | 有/无 | False | True | 冲突时覆盖(ON DUPLICATE KEY)|
1521
- | 有/无 | True | False | 主动查重,冲突时跳过,不覆盖 |
1522
- | 有/无 | True | True | 主动查重,冲突时覆盖 |
1523
-
1524
- - unique_keys 只决定唯一性,不决定是否覆盖。
1525
- - check_duplicate=True 时,插入前主动查重,重复数据跳过或覆盖,取决于 update_on_duplicate。
1526
- - update_on_duplicate=True 时,遇到唯一约束冲突会用新数据覆盖旧数据。
1527
- - 只要 update_on_duplicate=True 且表存在唯一约束(如 unique_keys),无论 check_duplicate 是否为 True,都会更新旧数据(即 ON DUPLICATE KEY UPDATE 生效)。
1528
- - 如需"覆盖"行为,务必设置 update_on_duplicate=True,不管 check_duplicate 是否为 True。
1529
- - 如需"跳过"行为,设置 update_on_duplicate=False 即可。
1530
-
1531
- ---
1532
- auto_timestamps 参数:
1533
-
1534
- - 当 auto_timestamps=True 时,系统会自动添加'创建时间'和'更新时间'两列
1535
- - 如果原始数据中已存在这两列,系统会先移除原始数据中的这些列,然后添加新的时间戳
1536
- - '创建时间':记录数据首次插入的时间,使用当前时间戳
1537
- - '更新时间':记录数据最后更新的时间,插入时与创建时间相同,更新时会自动更新为当前时间
1538
- - 时间戳列的数据类型为 DATETIME,格式为 'YYYY-MM-DD HH:MM:SS'
1539
- - 这两列会自动添加到 set_typ 中,无需手动指定
1540
- - 建议在需要审计数据变更历史的表中启用此功能
1541
- """
1542
- # upload_start = time.time()
1543
- # 检查data参数是否为None
1544
- if data is None:
1545
- logger.error('data参数不能为None', {
1546
- '库': db_name,
1547
- '表': table_name,
1548
- })
1549
- raise ValueError("data参数不能为None,请传入有效的数据")
1550
-
1551
- if isinstance(data, list) or (hasattr(data, 'shape') and hasattr(data, '__len__')):
1552
- initial_row_count = len(data)
1553
- else:
1554
- initial_row_count = 1
1555
-
1556
- batch_id = f"batch_{int(time.time() * 1000)}"
1557
- success_flag = False
1558
- dropped_rows = 0
1559
- total_inserted = 0
1560
- total_skipped = 0
1561
- total_failed = 0
1562
- validated_primary_keys = None
1563
- validated_indexes = None
1564
- validated_unique_keys = None
1565
- prepared_data = None
1566
- filtered_set_typ = None
1567
- inserted = None
1568
- skipped = None
1569
- failed = None
1570
-
1571
- try:
1572
- # 验证参数格式
1573
- validated_primary_keys = self._validate_primary_keys_format(primary_keys, db_name, table_name)
1574
- validated_indexes = self._validate_indexes_format(indexes, db_name, table_name)
1575
- validated_unique_keys = self._validate_unique_keys_format(unique_keys, db_name, table_name)
507
+ value = row.get(col_name)
508
+ try:
509
+ prepared_row[col_name] = DataValidator.validate_and_convert_value(
510
+ value, col_type, allow_null
511
+ )
512
+ except ValueError as e:
513
+ logger.error('数据验证失败', {
514
+ '行号': row_idx,
515
+ '列名': col_name,
516
+ '原始值': value,
517
+ '错误': str(e)
518
+ })
519
+ raise ValueError(f"行{row_idx}列{col_name}验证失败: {str(e)}")
1576
520
 
1577
- logger.debug("开始上传", {
1578
- '库': db_name,
1579
- '表': table_name,
1580
- '批次': batch_id,
1581
- '传入': len(data) if hasattr(data, '__len__') else 1,
1582
- '参数': {
1583
- '主键': validated_primary_keys,
1584
- '去重': check_duplicate,
1585
- '去重列': duplicate_columns,
1586
- '允许空值': allow_null,
1587
- '分表方式': partition_by,
1588
- '分表列': partition_date_column,
1589
- # '自动建表': auto_create,
1590
- '索引': validated_indexes,
1591
- '更新旧数据': update_on_duplicate,
1592
- '事务模式': transaction_mode,
1593
- '唯一约束': validated_unique_keys
1594
- },
1595
- # '数据样例': self._shorten_for_log(data, 2)
1596
- })
521
+ prepared_data.append(prepared_row)
522
+
523
+ return prepared_data
524
+
525
+ @staticmethod
526
+ def partition_data_by_date(data: List[Dict], date_column: str,
527
+ partition_by: str) -> Dict[str, List[Dict]]:
528
+ """按日期分区数据"""
529
+ partitioned = {}
530
+ table_manager = TableManager(None, None) # 只用静态方法
531
+
532
+ for row in data:
533
+ if date_column not in row:
534
+ logger.warning('缺少分区日期列', {'列名': date_column, '行数据': row})
535
+ continue
1597
536
 
1598
- # 验证分表参数
1599
- if partition_by:
1600
- partition_by = str(partition_by).lower()
1601
- if partition_by not in ['year', 'month']:
1602
- logger.error('分表方式必须是 "year" 或 "month" 或 "None', {
1603
- '库': db_name,
1604
- '表': table_name,
1605
- '批次': batch_id,
1606
- '分表方式': partition_by,
1607
- })
1608
- raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
1609
-
1610
- # 准备数据
1611
- prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null, db_name, table_name, auto_timestamps)
1612
-
1613
- # 检查数据库是否存在
1614
- if not self._check_database_exists(db_name):
1615
- if auto_create:
1616
- self._create_database(db_name)
1617
- else:
1618
- logger.error('数据库不存在', {
1619
- '库': db_name,
1620
- })
1621
- raise ValueError(f"数据库不存在: `{db_name}`")
1622
-
1623
- # 处理分表逻辑
1624
- if partition_by:
1625
- partitioned_data = {}
1626
- for row in prepared_data:
1627
- try:
1628
- if partition_date_column not in row:
1629
- logger.error('异常缺失列',{
1630
- '库': db_name,
1631
- '表': table_name,
1632
- '批次': batch_id,
1633
- '缺失列': partition_date_column,
1634
- 'row': self._shorten_for_log(row),
1635
- })
1636
- dropped_rows += 1
1637
- continue
1638
- part_table = self._get_partition_table_name(
1639
- table_name,
1640
- str(row[partition_date_column]),
1641
- partition_by
1642
- )
1643
- if part_table not in partitioned_data:
1644
- partitioned_data[part_table] = []
1645
- partitioned_data[part_table].append(row)
1646
- except Exception as e:
1647
- logger.error('分表处理异常', {
1648
- '库': db_name,
1649
- '表': table_name,
1650
- 'row_data': self._shorten_for_log(row),
1651
- 'error': str(e),
1652
- })
1653
- dropped_rows += 1
1654
- continue
1655
-
1656
- # 对每个分表执行上传
1657
- total_inserted = 0
1658
- total_skipped = dropped_rows # 分表异常丢弃
1659
- total_failed = 0
1660
- for part_table, part_data in partitioned_data.items():
1661
- try:
1662
- inserted, skipped, failed = self._upload_to_table(
1663
- db_name, part_table, part_data, filtered_set_typ,
1664
- validated_primary_keys, check_duplicate, duplicate_columns,
1665
- allow_null, auto_create, partition_date_column,
1666
- validated_indexes, batch_id, update_on_duplicate, transaction_mode,
1667
- validated_unique_keys
1668
- )
1669
- total_inserted += inserted
1670
- total_skipped += skipped
1671
- total_failed += failed
1672
- if partition_date_column in filtered_set_typ:
1673
- try:
1674
- self._ensure_index(db_name, part_table, partition_date_column)
1675
- except Exception as e:
1676
- logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': part_table, '列': partition_date_column, '错误': str(e)})
1677
- except Exception as e:
1678
- logger.error('分表上传异常', {
1679
- '库': db_name,
1680
- '表': table_name,
1681
- '分表': part_table,
1682
- 'error': str(e),
1683
- '数据样例': self._shorten_for_log(part_data, 2),
1684
- })
1685
- continue # 跳过当前分表,继续处理其他分表
1686
- else:
1687
- # 不分表,直接上传
1688
- inserted, skipped, failed = self._upload_to_table(
1689
- db_name, table_name, prepared_data, filtered_set_typ,
1690
- validated_primary_keys, check_duplicate, duplicate_columns,
1691
- allow_null, auto_create, partition_date_column,
1692
- validated_indexes, batch_id, update_on_duplicate, transaction_mode,
1693
- validated_unique_keys
1694
- )
1695
- total_inserted = inserted
1696
- total_skipped = skipped
1697
- total_failed = failed
1698
- if partition_date_column in filtered_set_typ:
1699
- try:
1700
- self._ensure_index(db_name, table_name, partition_date_column)
1701
- except Exception as e:
1702
- logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': partition_date_column, '错误': str(e)})
1703
-
1704
- success_flag = True
1705
-
1706
- except Exception as e:
1707
- logger.error('上传过程发生全局错误', {
1708
- '库': db_name,
1709
- '表': table_name,
1710
- 'error': str(e),
1711
- 'error_type': type(e).__name__,
1712
- '数据样例': self._shorten_for_log(data, 2),
1713
- })
1714
- return False
1715
- finally:
1716
- logger.info("存储完成", {
1717
- '库': db_name,
1718
- '表': table_name,
1719
- '批次': batch_id,
1720
- 'finish': success_flag,
1721
- '数据行': initial_row_count,
1722
- '插入': total_inserted,
1723
- '跳过': total_skipped,
1724
- '失败': total_failed
1725
- })
1726
-
1727
- # 更新索引(只有在成功时才执行)
1728
- if success_flag and validated_indexes:
1729
537
  try:
1730
- self._update_indexes(db_name, table_name, validated_indexes)
538
+ partition_suffix = table_manager.get_partition_table_name(
539
+ '', row[date_column], partition_by
540
+ ).split('_', 1)[1] # 获取后缀部分
541
+
542
+ if partition_suffix not in partitioned:
543
+ partitioned[partition_suffix] = []
544
+ partitioned[partition_suffix].append(row)
1731
545
  except Exception as e:
1732
- logger.warning('更新索引时发生错误', {
1733
- '库': db_name,
1734
- '表': table_name,
1735
- '错误': str(e)
1736
- })
1737
- return True
546
+ logger.error('分区处理失败', {'行数据': row, '错误': str(e)})
547
+ continue
548
+
549
+ return partitioned
1738
550
 
1739
- @_execute_with_retry
1740
- def _insert_data(
1741
- self,
1742
- db_name: str,
1743
- table_name: str,
1744
- data: List[Dict],
1745
- set_typ: Dict[str, str],
1746
- check_duplicate: bool,
1747
- duplicate_columns: Optional[List[str]],
1748
- batch_id: Optional[str] = None,
1749
- update_on_duplicate: bool = False,
1750
- transaction_mode: str = "batch"
1751
- ):
1752
- """
1753
- 实际执行数据插入的方法
1754
551
 
1755
- :param db_name: 数据库名
1756
- :param table_name: 表名
1757
- :param data: 要插入的数据列表
1758
- :param set_typ: 列名和数据类型字典 {列名: 数据类型}
1759
- :param check_duplicate: 是否检查重复数据
1760
- :param duplicate_columns: 用于检查重复的列,可选
1761
- :param batch_id: 批次ID用于日志追踪,可选
1762
- :param update_on_duplicate: 遇到重复数据时是否更新旧数据,默认为False
1763
- :param transaction_mode: 事务模式,可选值:
1764
- - 'row' : 逐行提交事务(错误隔离性好)
1765
- - 'batch' : 整批提交事务(性能最优)
1766
- - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
1767
- """
552
+ class DataInserter:
553
+ """数据插入器"""
554
+
555
+ def __init__(self, connection_manager: DatabaseConnectionManager):
556
+ self.conn_mgr = connection_manager
557
+
558
+ def insert_data(self, db_name: str, table_name: str, data: List[Dict],
559
+ set_typ: Dict[str, str], update_on_duplicate: bool = False) -> Tuple[int, int, int]:
560
+ """插入数据"""
1768
561
  if not data:
1769
562
  return 0, 0, 0
1770
- transaction_mode = self._validate_transaction_mode(transaction_mode)
1771
- sql = self._prepare_insert_sql(
1772
- db_name, table_name, set_typ,
1773
- check_duplicate, duplicate_columns,
1774
- update_on_duplicate
1775
- )
1776
- total_inserted, total_skipped, total_failed = self._execute_batch_insert(
1777
- db_name, table_name, data, set_typ,
1778
- sql, check_duplicate, duplicate_columns,
1779
- batch_id, transaction_mode,
1780
- update_on_duplicate
1781
- )
1782
- logger.debug('插入完成', {
1783
- '库': db_name,
1784
- '表': table_name,
1785
- '总计': len(data),
1786
- '插入': total_inserted,
1787
- '跳过': total_skipped,
1788
- '失败': total_failed,
1789
- '事务模式': transaction_mode,
1790
- })
1791
- return total_inserted, total_skipped, total_failed
1792
-
1793
- def _validate_transaction_mode(self, mode: str) -> str:
1794
- """验证并标准化事务模式"""
1795
- valid_modes = ('row', 'batch', 'hybrid')
1796
- if mode.lower() not in valid_modes:
1797
- logger.error('事务模式参数错误', {
1798
- '错误值': mode,
1799
- '可选值': valid_modes,
1800
- '自动使用默认模式': 'batch',
1801
- })
1802
- return 'batch'
1803
- return mode.lower()
1804
-
1805
- def _build_simple_insert_sql(self, db_name, table_name, columns, update_on_duplicate):
1806
- safe_columns = [self._validate_identifier(col) for col in columns]
1807
- placeholders = ','.join(['%s'] * len(safe_columns))
1808
-
563
+
564
+ # 准备SQL语句(排除系统列)
565
+ columns = [col for col in set_typ.keys() if col.lower() not in ['id', 'create_at', 'update_at']]
566
+ safe_columns = [TableManager._sanitize_identifier(col) for col in columns]
567
+ placeholders = ','.join(['%s'] * len(columns))
568
+
1809
569
  sql = f"""
1810
- INSERT INTO `{db_name}`.`{table_name}`
1811
- (`{'`,`'.join(safe_columns)}`)
1812
- VALUES ({placeholders})
570
+ INSERT INTO `{db_name}`.`{table_name}`
571
+ (`{'`,`'.join(safe_columns)}`)
572
+ VALUES ({placeholders})
1813
573
  """
1814
-
1815
- # 情况2:不检查重复但允许更新
574
+
1816
575
  if update_on_duplicate:
1817
- update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
1818
- for col in columns])
576
+ # 更新时只更新业务列,不更新create_at,update_at会自动更新
577
+ update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in safe_columns])
1819
578
  sql += f" ON DUPLICATE KEY UPDATE {update_clause}"
1820
-
1821
- return sql
1822
-
1823
- def _build_duplicate_check_sql(self, db_name, table_name, all_columns,
1824
- duplicate_columns, update_on_duplicate, set_typ):
1825
- if duplicate_columns is None:
1826
- duplicate_columns = []
1827
- duplicate_columns = [_item for _item in duplicate_columns if _item.lower() not in self.base_excute_col]
1828
- safe_columns = [self._validate_identifier(col) for col in all_columns]
1829
- placeholders = ','.join(['%s'] * len(safe_columns))
1830
-
1831
- # 确定排重列(排除id和更新时间列)
1832
- dup_cols = duplicate_columns if duplicate_columns else all_columns
1833
-
1834
- # 构建排重条件
1835
- conditions = []
1836
- for col in dup_cols:
1837
- col_type = set_typ.get(col, '').lower()
1838
- if col_type.startswith('decimal'):
1839
- _, scale = self._get_decimal_scale(col_type)
1840
- conditions.append(f"ROUND(`{col}`, {scale}) = ROUND(%s, {scale})")
1841
- else:
1842
- conditions.append(f"`{col}` = %s")
1843
-
1844
- # 情况3/5:允许更新
1845
- if update_on_duplicate:
1846
- update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
1847
- for col in all_columns])
1848
- sql = f"""
1849
- INSERT INTO `{db_name}`.`{table_name}`
1850
- (`{'`,`'.join(safe_columns)}`)
1851
- VALUES ({placeholders})
1852
- ON DUPLICATE KEY UPDATE {update_clause}
1853
- """
1854
- else:
1855
- # 情况4/6:不允许更新
1856
- sql = f"""
1857
- INSERT INTO `{db_name}`.`{table_name}`
1858
- (`{'`,`'.join(safe_columns)}`)
1859
- SELECT {placeholders}
1860
- FROM DUAL
1861
- WHERE NOT EXISTS (
1862
- SELECT 1 FROM `{db_name}`.`{table_name}`
1863
- WHERE {' AND '.join(conditions)}
1864
- )
1865
- """
1866
- return sql
1867
-
1868
- def _get_decimal_scale(self, decimal_type: str) -> Tuple[int, int]:
1869
- """从DECIMAL类型字符串中提取精度和标度"""
1870
- match = re.search(r'\((\d+)\s*,\s*(\d+)\)', decimal_type)
1871
- if match:
1872
- return int(match.group(1)), int(match.group(2))
1873
- return 18, 2 # 默认值
1874
-
1875
- def _prepare_insert_sql(
1876
- self,
1877
- db_name: str,
1878
- table_name: str,
1879
- set_typ: Dict[str, str],
1880
- check_duplicate: bool,
1881
- duplicate_columns: Optional[List[str]],
1882
- update_on_duplicate: bool
1883
- ) -> str:
1884
- """
1885
- 准备插入SQL语句, 增加StatementCache缓存
1886
- """
1887
- cache_key = (db_name, table_name, tuple(sorted(set_typ.items())), check_duplicate, tuple(duplicate_columns) if duplicate_columns else (), update_on_duplicate)
1888
- cached = self._prepared_statements.get(cache_key)
1889
- if cached:
1890
- return cached
1891
- # 获取所有列名(排除id和自动时间戳字段)
1892
- all_columns = []
1893
- for col in set_typ.keys():
1894
- if col.lower() == 'id':
1895
- continue
1896
- # 检查是否是自动时间戳字段
1897
- col_type_lower = set_typ[col].lower()
1898
- is_auto_timestamp = ('timestamp' in col_type_lower and 'current_timestamp' in col_type_lower and
1899
- col in ['创建时间', '更新时间'])
1900
- if not is_auto_timestamp:
1901
- all_columns.append(col)
1902
- if not check_duplicate:
1903
- sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
1904
- update_on_duplicate)
1905
- else:
1906
- dup_cols = duplicate_columns if duplicate_columns else [
1907
- col for col in all_columns
1908
- if col.lower() not in self.base_excute_col
1909
- ]
1910
- sql = self._build_duplicate_check_sql(db_name, table_name, all_columns,
1911
- dup_cols, update_on_duplicate, set_typ)
1912
- self._prepared_statements[cache_key] = sql
1913
- return sql
1914
-
1915
- def _execute_batch_insert(
1916
- self,
1917
- db_name: str,
1918
- table_name: str,
1919
- data: List[Dict],
1920
- set_typ: Dict[str, str],
1921
- sql: str,
1922
- check_duplicate: bool,
1923
- duplicate_columns: Optional[List[str]],
1924
- batch_id: Optional[str],
1925
- transaction_mode: str,
1926
- update_on_duplicate: bool = False
1927
- ) -> Tuple[int, int, int]:
1928
- """
1929
- 执行批量插入操作,优化batch和hybrid模式。
1930
-
1931
- - batch模式下,使用executemany批量插入(如SQL带ON DUPLICATE KEY UPDATE时),MySQL会对每一行单独判断唯一约束:
1932
- - 不冲突的行会被正常插入。
1933
- - 冲突的行会触发ON DUPLICATE KEY UPDATE,用新数据更新旧数据。
1934
- - 不会因为一行冲突导致整批失败或回滚。
1935
- - 只有遇到严重的数据库错误(如所有行都因唯一约束冲突且没有ON DUPLICATE KEY UPDATE),才会整体回滚。
1936
- - 返回值为(插入行数, 跳过行数, 失败行数)。
1937
- """
1938
- def get_optimal_batch_size(total_rows: int) -> int:
1939
- if total_rows <= 100:
1940
- return total_rows
1941
- elif total_rows <= 1000:
1942
- return 500
1943
- elif total_rows <= 10000:
1944
- return 1000
1945
- else:
1946
- return 2000
1947
-
1948
- def ensure_basic_type(value):
1949
- """确保值是基本数据类型,如果是字典或列表则转换为字符串"""
1950
- if isinstance(value, (dict, list)):
1951
- try:
1952
- return json.dumps(value, ensure_ascii=False)
1953
- except (TypeError, ValueError):
1954
- return str(value)
1955
- return value
1956
579
 
1957
-
1958
- batch_size = get_optimal_batch_size(len(data))
1959
- # 排除id列和自动时间戳列
1960
- all_columns = []
1961
- for col in set_typ.keys():
1962
- if col.lower() == 'id':
1963
- continue
1964
- # 检查是否是自动时间戳字段
1965
- col_type_lower = set_typ[col].lower()
1966
- is_auto_timestamp = ('timestamp' in col_type_lower and 'current_timestamp' in col_type_lower and
1967
- col in ['创建时间', '更新时间'])
1968
- if not is_auto_timestamp:
1969
- all_columns.append(col)
580
+ # 批量插入
581
+ return self._execute_batch_insert(sql, data, columns)
582
+
583
+ def _execute_batch_insert(self, sql: str, data: List[Dict],
584
+ columns: List[str]) -> Tuple[int, int, int]:
585
+ """执行批量插入"""
586
+ batch_size = min(1000, len(data))
1970
587
  total_inserted = 0
1971
588
  total_skipped = 0
1972
589
  total_failed = 0
1973
- with self._get_connection() as conn:
590
+
591
+ with self.conn_mgr.get_connection() as conn:
1974
592
  with conn.cursor() as cursor:
1975
- if transaction_mode == 'batch':
1976
- for i in range(0, len(data), batch_size):
1977
- batch = data[i:i + batch_size]
1978
- # 使用批量插入逻辑
1979
- values_list = []
1980
- for row in batch:
1981
- values = [ensure_basic_type(row.get(col)) for col in all_columns]
1982
- if check_duplicate and not update_on_duplicate:
1983
- dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
1984
- values += [ensure_basic_type(row.get(col)) for col in dup_cols]
1985
- values_list.append(values)
1986
- try:
1987
- cursor.executemany(sql, values_list)
1988
- conn.commit()
1989
- # 在batch模式下,affected_rows表示实际影响的行数
1990
- # 如果update_on_duplicate为True,则affected_rows包含更新的行数
1991
- # 如果update_on_duplicate为False,则affected_rows只包含插入的行数
1992
- affected = cursor.rowcount if cursor.rowcount is not None else 0
1993
- if update_on_duplicate:
1994
- # 当启用更新时,affected_rows包含插入和更新的行数
1995
- # 我们需要区分插入和更新的行数
1996
- # 由于无法准确区分,我们假设所有行都是插入的
1997
- total_inserted += len(batch)
1998
- else:
1999
- # 当不启用更新时,affected_rows只包含插入的行数
2000
- total_inserted += affected
2001
- total_skipped += len(batch) - affected
2002
- except pymysql.err.IntegrityError as e:
2003
- conn.rollback()
2004
- # 在唯一约束冲突时,所有行都被跳过
2005
- total_skipped += len(batch)
2006
- logger.debug('批量插入唯一约束冲突,全部跳过', {'库': db_name, '表': table_name, '错误': str(e)})
2007
- except Exception as e:
2008
- conn.rollback()
2009
- total_failed += len(batch)
2010
- logger.error('批量插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
2011
- elif transaction_mode == 'hybrid':
2012
- hybrid_n = 100 # 可配置
2013
- for i in range(0, len(data), hybrid_n):
2014
- batch = data[i:i + hybrid_n]
2015
- for row in batch:
2016
- try:
2017
- values = [ensure_basic_type(row.get(col)) for col in all_columns]
2018
- if check_duplicate and not update_on_duplicate:
2019
- dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
2020
- values += [ensure_basic_type(row.get(col)) for col in dup_cols]
2021
- cursor.execute(sql, values)
2022
- affected = cursor.rowcount if cursor.rowcount is not None else 0
2023
- if update_on_duplicate:
2024
- # 当启用更新时,affected_rows包含插入和更新的行数
2025
- # 假设所有行都是插入的,因为无法区分插入和更新
2026
- total_inserted += 1
2027
- else:
2028
- # 当不启用更新时,affected_rows只包含插入的行数
2029
- if affected > 0:
2030
- total_inserted += 1
2031
- else:
2032
- total_skipped += 1
2033
- except pymysql.err.IntegrityError as e:
2034
- conn.rollback()
2035
- total_skipped += 1
2036
- logger.debug('hybrid单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
2037
- except Exception as e:
2038
- conn.rollback()
2039
- total_failed += 1
2040
- logger.error('hybrid单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
593
+ for i in range(0, len(data), batch_size):
594
+ batch = data[i:i + batch_size]
595
+ values_list = []
596
+
597
+ for row in batch:
598
+ values = [self._ensure_basic_type(row.get(col)) for col in columns]
599
+ values_list.append(values)
600
+
601
+ try:
602
+ cursor.executemany(sql, values_list)
2041
603
  conn.commit()
2042
- else: # row模式
2043
- for row in data:
2044
- try:
2045
- values = [ensure_basic_type(row.get(col)) for col in all_columns]
2046
- if check_duplicate and not update_on_duplicate:
2047
- dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
2048
- values += [ensure_basic_type(row.get(col)) for col in dup_cols]
2049
- cursor.execute(sql, values)
2050
- affected = cursor.rowcount if cursor.rowcount is not None else 0
2051
- if update_on_duplicate:
2052
- # 当启用更新时,affected_rows包含插入和更新的行数
2053
- # 假设所有行都是插入的,因为无法区分插入和更新
2054
- total_inserted += 1
2055
- else:
2056
- # 当不启用更新时,affected_rows只包含插入的行数
2057
- if affected > 0:
2058
- total_inserted += 1
2059
- else:
2060
- total_skipped += 1
2061
- conn.commit()
2062
- except pymysql.err.IntegrityError as e:
2063
- conn.rollback()
2064
- total_skipped += 1
2065
- logger.debug('单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
2066
- except Exception as e:
2067
- conn.rollback()
2068
- total_failed += 1
2069
- logger.error('单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
604
+ affected = cursor.rowcount if cursor.rowcount is not None else len(batch)
605
+ total_inserted += affected
606
+ except pymysql.err.IntegrityError:
607
+ conn.rollback()
608
+ total_skipped += len(batch)
609
+ logger.debug('批量插入唯一约束冲突,跳过', {'批次大小': len(batch)})
610
+ except Exception as e:
611
+ conn.rollback()
612
+ total_failed += len(batch)
613
+ logger.error('批量插入失败', {'错误': str(e), '批次大小': len(batch)})
614
+
2070
615
  return total_inserted, total_skipped, total_failed
2071
-
2072
- def _check_pool_health(self) -> bool:
2073
- """
2074
- 检查连接池健康状态,防止连接泄露
2075
- """
2076
- conn = None
2077
- try:
2078
- if not hasattr(self, 'pool') or self.pool is None:
2079
- return False
2080
- conn = self.pool.connection()
2081
- conn.ping(reconnect=True)
2082
- logger.debug('连接池健康检查通过')
2083
- return True
2084
- except Exception as e:
2085
- logger.warning('连接池健康检查失败', {'error': str(e)})
2086
- return False
2087
- finally:
2088
- if conn is not None:
2089
- try:
2090
- conn.close()
2091
- except Exception as e:
2092
- logger.warning('关闭连接时出错', {'error': str(e)})
2093
-
616
+
2094
617
  @staticmethod
2095
- def retry_on_failure(max_retries: int = 3, delay: int = 1):
2096
- """
2097
- 通用重试装饰器
2098
- :param max_retries: 最大重试次数
2099
- :param delay: 重试间隔(秒)
2100
- :return: 装饰器
2101
- """
2102
- def decorator(func):
2103
- @wraps(func)
2104
- def wrapper(*args, **kwargs):
2105
- last_exception = None
2106
- for attempt in range(max_retries):
2107
- try:
2108
- return func(*args, **kwargs)
2109
- except (pymysql.OperationalError, pymysql.InterfaceError) as e:
2110
- last_exception = e
2111
- logger.warning('操作失败,准备重试', {'attempt': attempt + 1, 'error': str(e)})
2112
- if attempt < max_retries - 1:
2113
- time.sleep(delay * (attempt + 1))
2114
- continue
2115
- logger.error(f'操作重试 {max_retries} 次后失败', {'error': str(e)})
2116
- raise
2117
- except Exception as e:
2118
- logger.error('操作失败', {'error': str(e)})
2119
- raise
2120
- raise last_exception if last_exception else logger.error('操作重试失败,未知错误')
2121
- return wrapper
2122
- return decorator
2123
-
2124
- def _shorten_for_log(self, obj: Any, maxlen: int = 200) -> Any:
2125
- """
2126
- 日志安全截断工具:对字符串、列表、字典等做长度限制,避免日志过长。
2127
- :param obj: 原始对象
2128
- :param maxlen: 最大长度/元素数
2129
- :return: 截断后的对象
2130
- """
2131
- if isinstance(obj, str):
2132
- return obj[:maxlen] + ("..." if len(obj) > maxlen else "")
2133
- elif isinstance(obj, list):
2134
- return obj[:maxlen] + (["..."] if len(obj) > maxlen else [])
2135
- elif isinstance(obj, dict):
2136
- short = {k: self._shorten_for_log(v, maxlen) for i, (k, v) in enumerate(obj.items()) if i < maxlen}
2137
- if len(obj) > maxlen:
2138
- short['...'] = f"total_keys={len(obj)}"
2139
- return short
2140
- elif hasattr(obj, 'shape') and hasattr(obj, 'head'):
2141
- # pandas DataFrame
2142
- return f"DataFrame shape={obj.shape}, head={obj.head(1).to_dict()}"
2143
- return obj
2144
-
2145
- def _normalize_col(self, col: str) -> str:
2146
- """
2147
- 列名自动清洗并转小写(如case_sensitive为False),保证和表结构一致。
2148
- """
2149
- safe = self._validate_identifier(col)
2150
- return safe if self.case_sensitive else safe.lower()
2151
-
2152
- def _update_indexes(self, db_name: str, table_name: str, indexes: Optional[List[str]]):
2153
- """
2154
- 更新索引,避免重复添加或更新,同时注意大小写一致性。
2155
- 注意:如果列已经在unique_keys中定义,则不会重复创建普通索引。
2156
-
2157
- :param db_name: 数据库名
2158
- :param table_name: 表名
2159
- :param indexes: 需要更新的索引列列表
2160
- """
2161
- if not indexes:
2162
- return
2163
-
2164
- # 规范化索引列名
2165
- normalized_indexes = [self._normalize_col(idx) for idx in indexes]
2166
-
2167
- # 获取现有索引(包括普通索引和唯一约束)
2168
- try:
2169
- existing_indexes = self._get_existing_indexes(db_name, table_name)
2170
- except Exception as e:
2171
- logger.error('获取现有索引时发生错误', {'库': db_name, '表': table_name, '错误': str(e)})
2172
- raise
2173
-
2174
- # 获取表中现有的列名
2175
- try:
2176
- existing_columns = self._get_table_columns(db_name, table_name)
2177
- except Exception as e:
2178
- logger.error('获取现有列时发生错误', {'库': db_name, '表': table_name, '错误': str(e)})
2179
- raise
2180
-
2181
- # 找出需要添加的索引(排除已存在的索引和不在表中的列)
2182
- indexes_to_add = []
2183
- for idx in normalized_indexes:
2184
- if idx not in existing_indexes and idx in existing_columns:
2185
- indexes_to_add.append(idx)
2186
- elif idx in existing_indexes:
2187
- logger.debug('索引已存在,跳过', {'库': db_name, '表': table_name, '列': idx})
2188
- elif idx not in existing_columns:
2189
- logger.warning('索引列不存在于表中,跳过', {'库': db_name, '表': table_name, '列': idx})
2190
-
2191
- # 添加新索引
2192
- for idx in indexes_to_add:
618
+ def _ensure_basic_type(value):
619
+ """确保值是基本数据类型"""
620
+ if isinstance(value, (dict, list)):
2193
621
  try:
2194
- self._add_index(db_name, table_name, idx)
2195
- except Exception as e:
2196
- logger.error('添加索引时发生错误', {'库': db_name, '表': table_name, '列': idx, '错误': str(e)})
2197
- raise
2198
-
2199
- def _get_existing_indexes(self, db_name: str, table_name: str) -> Set[str]:
2200
- """
2201
- 获取表中现有的索引列名(包括普通索引和唯一约束)。
2202
-
2203
- :param db_name: 数据库名
2204
- :param table_name: 表名
2205
- :return: 现有索引列名的集合
2206
- """
2207
- sql = """
2208
- SELECT COLUMN_NAME
2209
- FROM INFORMATION_SCHEMA.STATISTICS
2210
- WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
2211
- """
2212
- existing_indexes = set()
2213
- try:
2214
- with self._get_connection() as conn:
2215
- with conn.cursor() as cursor:
2216
- cursor.execute(sql, (db_name, table_name))
2217
- existing_indexes = {row['COLUMN_NAME'] for row in cursor.fetchall()}
2218
- except Exception as e:
2219
- logger.error('获取现有索引失败', {'库': db_name, '表': table_name, '错误': str(e)})
2220
- raise
2221
- return existing_indexes
2222
-
2223
- def _add_index(self, db_name: str, table_name: str, column: str):
2224
- """
2225
- 添加索引到指定列。
2226
-
2227
- :param db_name: 数据库名
2228
- :param table_name: 表名
2229
- :param column: 需要添加索引的列名
2230
- """
2231
- sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{column}` (`{column}`)'
2232
- try:
2233
- with self._get_connection() as conn:
2234
- with conn.cursor() as cursor:
2235
- cursor.execute(sql)
2236
- conn.commit()
2237
- logger.debug('已为列创建索引', {'库': db_name, '表': table_name, '列': column})
2238
- except Exception as e:
2239
- logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
2240
- raise
622
+ return json.dumps(value, ensure_ascii=False)
623
+ except (TypeError, ValueError):
624
+ return str(value)
625
+ return value
2241
626
 
2242
- @_execute_with_retry
2243
- def _add_column_to_table(self, db_name: str, table_name: str, column: str, column_type: str, allow_null: bool = False):
2244
- """
2245
- 添加列到指定表。
2246
627
 
2247
- :param db_name: 数据库名
2248
- :param table_name: 表名
2249
- :param column: 需要添加的列名
2250
- :param column_type: 列的数据类型
2251
- :param allow_null: 是否允许空值,默认为False
2252
- """
2253
- db_name = self._validate_identifier(db_name, is_database=True)
2254
- table_name = self._validate_identifier(table_name)
2255
- column = self._validate_identifier(column)
2256
-
2257
- # 构建ALTER TABLE语句
2258
- null_constraint = "NULL" if allow_null else "NOT NULL"
2259
-
2260
- # 为新添加的列设置默认值
2261
- default_value = ""
2262
- if not allow_null:
2263
- column_type_lower = column_type.lower()
2264
- if any(t in column_type_lower for t in ['int', 'bigint', 'tinyint', 'smallint', 'mediumint']):
2265
- default_value = " DEFAULT 0"
2266
- elif any(t in column_type_lower for t in ['decimal', 'float', 'double']):
2267
- default_value = " DEFAULT 0.0"
2268
- elif any(t in column_type_lower for t in ['varchar', 'text', 'char', 'mediumtext', 'longtext']):
2269
- default_value = " DEFAULT 'none'"
2270
- elif 'timestamp' in column_type_lower:
2271
- # TIMESTAMP类型已经包含DEFAULT定义,不需要额外添加
2272
- default_value = ""
2273
- elif 'date' in column_type_lower:
2274
- if 'datetime' in column_type_lower:
2275
- default_value = " DEFAULT '2000-01-01 00:00:00'"
2276
- else:
2277
- default_value = " DEFAULT '2000-01-01'"
2278
- elif 'json' in column_type_lower:
2279
- default_value = " DEFAULT '{}'"
2280
-
2281
- # 对于TIMESTAMP类型,不添加额外的NULL约束,因为已经包含在类型定义中
2282
- if 'timestamp' in column_type.lower() and ('default' in column_type.lower() or 'current_timestamp' in column_type.lower()):
2283
- null_constraint = "" # TIMESTAMP类型已经包含完整定义
2284
- default_value = ""
2285
-
2286
- sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD COLUMN `{column}` {column_type} {null_constraint}{default_value}'
2287
-
2288
- conn = None
2289
- try:
2290
- with self._get_connection() as conn:
2291
- with conn.cursor() as cursor:
2292
- cursor.execute(sql)
2293
- conn.commit()
2294
- logger.debug('已为表添加列', {
2295
- '库': db_name,
2296
- '表': table_name,
2297
- '列': column,
2298
- '类型': column_type,
2299
- '允许空值': allow_null
2300
- })
2301
- except Exception as e:
2302
- logger.error('添加列失败', {
2303
- '库': db_name,
2304
- '表': table_name,
2305
- '列': column,
2306
- '类型': column_type,
2307
- '错误': str(e),
2308
- 'SQL': sql
2309
- })
2310
- if conn is not None:
2311
- conn.rollback()
2312
- raise
2313
-
2314
- def __enter__(self):
2315
- return self
2316
-
2317
- def close(self) -> None:
2318
- """
2319
- 关闭连接池并清理资源
2320
- 这个方法会安全地关闭数据库连接池,并清理相关资源。
2321
- 建议结束时手动调用此方法。
2322
- :raises: 可能抛出关闭连接时的异常
2323
- """
2324
- try:
2325
- if hasattr(self, 'pool') and self.pool is not None:
628
def retry_on_failure(max_retries: int = 3, delay: int = 1):
    """Build a decorator that retries a call on pymysql connection-level errors.

    The wrapped callable is re-invoked when it raises
    ``pymysql.OperationalError`` or ``pymysql.InterfaceError``. Between
    attempts the wrapper sleeps ``delay * attempt_number`` seconds (linear
    back-off). Any other exception is logged and re-raised immediately.

    :param max_retries: total number of attempts; values below 1 are treated
        as 1 so the wrapped callable is always invoked at least once
    :param delay: base sleep in seconds, multiplied by the attempt number
    :return: a decorator preserving the wrapped function's metadata
    :raises: the last retryable error once all attempts are used up, or the
        first non-retryable error
    """
    # A non-positive max_retries used to skip the call entirely and then hit
    # ``raise None``; clamp so there is always exactly one attempt at minimum.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                    if attempt >= attempts:
                        # Out of attempts: report and propagate the error.
                        logger.error(f'操作重试{attempts}次后失败', {'错误': str(e)})
                        raise
                    logger.warning('操作失败,准备重试', {
                        '尝试次数': attempt,
                        '错误': str(e)
                    })
                    time.sleep(delay * attempt)
                except Exception as e:
                    # Not a connection-level error: retrying would not help.
                    logger.error('操作失败', {'错误': str(e)})
                    raise
        return wrapper
    return decorator
2334
654
 
2335
- def __exit__(self, exc_type, exc_val, exc_tb):
2336
- self.close()
2337
655
 
2338
- # @_execute_with_retry
2339
- def execute_query(self, sql: str, params: Optional[Tuple] = None) -> List[Dict]:
2340
- """
2341
- 执行查询SQL语句并返回结果
2342
-
2343
- :param sql: SQL查询语句
2344
- :param params: SQL参数,可选
2345
- :return: 查询结果列表,每个元素为字典格式
2346
- :raises: 可能抛出数据库相关异常
2347
- """
2348
- if not sql or not isinstance(sql, str):
2349
- logger.error('无效的SQL语句', {'sql': sql})
2350
- raise ValueError('SQL语句不能为空且必须是字符串')
2351
-
2352
- try:
2353
- with self._get_connection() as conn:
2354
- with conn.cursor() as cursor:
2355
- cursor.execute(sql, params)
2356
- results = cursor.fetchall()
2357
- logger.debug('查询执行成功', {
2358
- 'sql': self._shorten_for_log(sql, 100),
2359
- 'params': self._shorten_for_log(params, 50),
2360
- '结果数量': len(results)
2361
- })
2362
- return results
2363
- except Exception as e:
2364
- logger.error('执行查询时出错', {
2365
- 'sql': self._shorten_for_log(sql, 100),
2366
- 'params': self._shorten_for_log(params, 50),
2367
- 'error': str(e)
2368
- })
2369
- raise
2370
-
2371
- # @_execute_with_retry
2372
- def execute_update(self, sql: str, params: Optional[Tuple] = None) -> int:
656
class MySQLUploader:
    """
    MySQL data uploader (refactored version).

    Orchestrates three collaborators created in ``__init__``: a
    ``DatabaseConnectionManager``, a ``TableManager`` and a ``DataInserter``.
    An upload normalizes the input, optionally infers column types, makes
    sure the database and table exist, then inserts the rows, either into
    one table or into one table per date partition.

    The original notes state that every table automatically receives an
    ``id`` (BIGINT auto-increment primary key) plus ``create_at`` and
    ``update_at`` timestamp columns; that is done by the table manager and is
    not visible in this class.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`.
    """

    def __init__(self, username: str, password: str, host: str = 'localhost',
                 port: int = 3306, charset: str = 'utf8mb4',
                 collation: str = 'utf8mb4_0900_ai_ci', pool_size: int = 5,
                 max_retries: int = 3, **kwargs):
        """
        Initialize the uploader and its collaborating components.

        :param username: database user name
        :param password: database password
        :param host: database host
        :param port: database port
        :param charset: connection character set
        :param collation: collation, handed to the table manager
        :param pool_size: connection pool size
        :param max_retries: maximum retry count (stored on the instance)
        :param kwargs: extra entries merged into the connection config
        """
        self.config = {
            'username': username,
            'password': password,
            'host': host,
            'port': port,
            'charset': charset,
            'pool_size': pool_size,
            **kwargs
        }
        self.collation = collation
        # NOTE(review): stored but not read anywhere in this class; the retry
        # decorator on upload_data uses its own hard-coded count.
        self.max_retries = max_retries

        # Build the collaborating components.
        self.conn_mgr = DatabaseConnectionManager(self.config)
        self.table_mgr = TableManager(self.conn_mgr, collation)
        self.data_inserter = DataInserter(self.conn_mgr)

    # NOTE(review): every exception raised inside upload_data's try block is
    # caught there and turned into ``False``, so this decorator only ever sees
    # errors raised before the try (the ``.lower()`` calls). Making the retry
    # effective would re-run a possibly half-committed upload, so it is left
    # as is - confirm the intended semantics before changing it.
    @retry_on_failure(max_retries=3)
    def upload_data(self, db_name: str, table_name: str,
                    data: Union[Dict, List[Dict], pd.DataFrame],
                    set_typ: Optional[Dict[str, str]] = None,
                    allow_null: bool = False,
                    partition_by: Optional[str] = None,
                    partition_date_column: str = '日期',
                    update_on_duplicate: bool = False,
                    unique_keys: Optional[List[List[str]]] = None) -> bool:
        """
        Upload data to a MySQL database.

        :param db_name: database name (lower-cased before use)
        :param table_name: table name (lower-cased before use)
        :param data: the rows to upload
        :param set_typ: column type mapping; inferred from the data when None
            (system columns need not be listed, per the original notes)
        :param allow_null: whether empty values are allowed
        :param partition_by: partitioning mode (the original notes mention
            'year' or 'month'); any falsy value uploads into a single table
        :param partition_date_column: name of the date column used to split
            rows across partition tables
        :param update_on_duplicate: whether duplicate rows should be updated
        :param unique_keys: list of unique-constraint column groups
        :return: True on success (including empty input), False when any row
            failed or an exception occurred during the upload
        """
        db_name = db_name.lower()
        table_name = table_name.lower()
        try:
            # Normalize the input into a list of row dicts.
            normalized_data = DataProcessor.normalize_data(data)
            if not normalized_data:
                logger.warning('数据为空,跳过上传')
                return True

            # Infer column types when the caller did not provide them.
            if set_typ is None:
                set_typ = DataTypeInferrer.infer_types_from_data(normalized_data)
                logger.info('自动推断数据类型', {'类型映射': set_typ})

            # Make sure the target database exists.
            self.table_mgr.ensure_database_exists(db_name)

            # Partitioned upload: one table per date partition.
            if partition_by:
                return self._handle_partitioned_upload(
                    db_name, table_name, normalized_data, set_typ,
                    partition_by, partition_date_column, allow_null,
                    update_on_duplicate, unique_keys
                )

            return self._handle_single_table_upload(
                db_name, table_name, normalized_data, set_typ,
                allow_null, update_on_duplicate, unique_keys
            )

        except Exception as e:
            # Failures are reported through the return value, not raised.
            logger.error('数据上传失败', {
                '数据库': db_name,
                '表名': table_name,
                '错误': str(e)
            })
            return False

    def _handle_single_table_upload(self, db_name: str, table_name: str,
                                    data: List[Dict], set_typ: Dict[str, str],
                                    allow_null: bool, update_on_duplicate: bool,
                                    unique_keys: Optional[List[List[str]]]) -> bool:
        """
        Upload rows into a single table, creating the table when missing.

        :return: True when the inserter reports zero failed rows
        """
        # Create the table only when it does not exist yet.
        if not self.table_mgr.table_exists(db_name, table_name):
            self.table_mgr.create_table(db_name, table_name, set_typ,
                                        unique_keys=unique_keys)

        # Convert the rows according to the column types.
        prepared_data = DataProcessor.prepare_data_for_insert(
            data, set_typ, allow_null
        )

        # Insert and collect the per-outcome counters.
        inserted, skipped, failed = self.data_inserter.insert_data(
            db_name, table_name, prepared_data, set_typ, update_on_duplicate
        )

        logger.info('单表上传完成', {
            '数据库': db_name,
            '表名': table_name,
            '总数': len(data),
            '插入': inserted,
            '跳过': skipped,
            '失败': failed
        })

        return failed == 0

    def _handle_partitioned_upload(self, db_name: str, base_table_name: str,
                                   data: List[Dict], set_typ: Dict[str, str],
                                   partition_by: str, partition_date_column: str,
                                   allow_null: bool, update_on_duplicate: bool,
                                   unique_keys: Optional[List[List[str]]]) -> bool:
        """
        Split rows by date and upload each group into its own table.

        Each partition goes to ``<base_table_name>_<suffix>``. All partitions
        are attempted even after one of them fails.

        :return: True only when every partition upload succeeded
        """
        # Group the rows by partition suffix.
        partitioned_data = DataProcessor.partition_data_by_date(
            data, partition_date_column, partition_by
        )

        total_success = True

        for partition_suffix, partition_data in partitioned_data.items():
            partition_table_name = f"{base_table_name}_{partition_suffix}"

            success = self._handle_single_table_upload(
                db_name, partition_table_name, partition_data, set_typ,
                allow_null, update_on_duplicate, unique_keys
            )

            if not success:
                total_success = False

        logger.info('分表上传完成', {
            '数据库': db_name,
            '基础表名': base_table_name,
            '分区数': len(partitioned_data),
            '总体成功': total_success
        })

        return total_success

    def close(self):
        """Close the connection manager, if one was created."""
        # getattr: __init__ may have failed before conn_mgr was assigned, and
        # __del__ still runs on such a half-built instance.
        conn_mgr = getattr(self, 'conn_mgr', None)
        if conn_mgr:
            conn_mgr.close()

    def __del__(self):
        # Best-effort cleanup at garbage collection; errors are ignored on
        # purpose, but KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            self.close()
        except Exception:
            pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
2740
848
 
2741
- def main():
2742
- dir_path = os.path.expanduser("~")
2743
- parser = myconf.ConfigParser()
2744
- host, port, username, password = parser.get_section_values(
2745
- file_path=os.path.join(dir_path, 'spd.txt'),
2746
- section='mysql',
2747
- keys=['host', 'port', 'username', 'password'],
2748
- )
2749
- host = 'localhost'
2750
849
 
850
# Usage example
if __name__ == '__main__':
    # Sample rows
    sample_data = [
        {'name': 'Alice', 'age': 25, 'salary': 50000.0, '日期': '2023-01-01'},
        {'name': 'Bob', 'age': 30, 'salary': 60000.0, '日期': '2023-01-02'},
    ]

    # Column types for the sample rows
    column_types = {
        'name': 'VARCHAR(255)',
        'age': 'INT',
        'salary': 'DECIMAL(10,2)',
        '日期': 'DATE'
    }

    # The uploader is a context manager: leaving the block calls close(),
    # so it is closed even if the upload raises.
    with MySQLUploader(
        username='your_username',
        password='your_password',
        host='localhost',
        port=3306
    ) as uploader:
        # Upload the sample rows
        success = uploader.upload_data(
            db_name='test_db',
            table_name='test_table',
            data=sample_data,
            set_typ=column_types,
            allow_null=False,
            update_on_duplicate=True,
            unique_keys=[['name', '日期']]
        )

    print(f"上传结果: {success}")