mdbq 3.9.0__py3-none-any.whl → 3.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.9.0'
1
+ VERSION = '3.9.1'
mdbq/mysql/mysql.py CHANGED
@@ -11,10 +11,8 @@ from sqlalchemy import create_engine
11
11
  import os
12
12
  import logging
13
13
  from mdbq.other import otk
14
-
15
14
  from dbutils.pooled_db import PooledDB
16
15
  from typing import Union, List, Dict, Optional, Any, Tuple
17
-
18
16
  warnings.filterwarnings('ignore')
19
17
  """
20
18
  建表流程:
@@ -59,7 +57,11 @@ class MySQLUploader:
59
57
  log_level: str = 'ERROR',
60
58
  max_retries: int = 10,
61
59
  retry_interval: int = 10,
62
- pool_size: int = 5
60
+ pool_size: int = 5,
61
+ connect_timeout: int = 10,
62
+ read_timeout: int = 30,
63
+ write_timeout: int = 30,
64
+ ssl: Optional[Dict] = None
63
65
  ):
64
66
  """
65
67
  初始化MySQL上传工具
@@ -75,6 +77,10 @@ class MySQLUploader:
75
77
  :param max_retries: 最大重试次数,默认为10
76
78
  :param retry_interval: 重试间隔(秒),默认为10
77
79
  :param pool_size: 连接池大小,默认为5
80
+ :param connect_timeout: 连接超时(秒),默认为10
81
+ :param read_timeout: 读取超时(秒),默认为30
82
+ :param write_timeout: 写入超时(秒),默认为30
83
+ :param ssl: SSL配置字典,默认为None
78
84
  """
79
85
  self.username = username
80
86
  self.password = password
@@ -82,37 +88,90 @@ class MySQLUploader:
82
88
  self.port = port
83
89
  self.charset = charset
84
90
  self.collation = collation
85
- self.max_retries = max_retries
86
- self.retry_interval = retry_interval
87
- self.pool_size = pool_size
91
+ self.max_retries = max(max_retries, 1) # 至少重试1次
92
+ self.retry_interval = max(retry_interval, 1) # 至少间隔1秒
93
+ self.pool_size = max(pool_size, 1) # 至少1个连接
94
+ self.connect_timeout = connect_timeout
95
+ self.read_timeout = read_timeout
96
+ self.write_timeout = write_timeout
97
+ self.ssl = ssl
98
+ self._prepared_statements = {} # 预处理语句缓存
99
+ self._max_cached_statements = 100 # 最大缓存语句数
88
100
 
89
101
  # 初始化日志
90
102
  if enable_logging:
91
103
  self._init_logging(log_level)
104
+ else:
105
+ self.logger = None
92
106
 
93
107
  # 创建连接池
94
108
  self.pool = self._create_connection_pool()
95
109
 
96
110
  def _init_logging(self, log_level: str):
97
111
  """初始化日志配置"""
112
+ valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
113
+ level = log_level.upper() if log_level.upper() in valid_levels else 'ERROR'
114
+
98
115
  logging.basicConfig(
99
- level=getattr(logging, log_level.upper(), logging.ERROR),
100
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
116
+ level=getattr(logging, level),
117
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
118
+ handlers=[logging.StreamHandler()]
101
119
  )
102
120
  self.logger = logging.getLogger('MySQLUploader')
103
121
 
104
- def _create_connection_pool(self):
122
+ def _create_connection_pool(self) -> PooledDB:
105
123
  """创建数据库连接池"""
106
- return PooledDB(
107
- creator=pymysql,
108
- host=self.host,
109
- port=self.port,
110
- user=self.username,
111
- password=self.password,
112
- charset=self.charset,
113
- maxconnections=self.pool_size,
114
- cursorclass=pymysql.cursors.DictCursor
115
- )
124
+ pool_params = {
125
+ 'creator': pymysql,
126
+ 'host': self.host,
127
+ 'port': self.port,
128
+ 'user': self.username,
129
+ 'password': self.password,
130
+ 'charset': self.charset,
131
+ 'cursorclass': pymysql.cursors.DictCursor,
132
+ 'maxconnections': self.pool_size,
133
+ 'ping': 7, # 连接检查
134
+ 'connect_timeout': self.connect_timeout,
135
+ 'read_timeout': self.read_timeout,
136
+ 'write_timeout': self.write_timeout,
137
+ 'autocommit': False
138
+ }
139
+
140
+ if self.ssl:
141
+ required_keys = {'ca', 'cert', 'key'}
142
+ if not all(k in self.ssl for k in required_keys):
143
+ raise ValueError("SSL配置必须包含ca、cert和key")
144
+ pool_params['ssl'] = {
145
+ 'ca': self.ssl['ca'],
146
+ 'cert': self.ssl['cert'],
147
+ 'key': self.ssl['key'],
148
+ 'check_hostname': self.ssl.get('check_hostname', False)
149
+ }
150
+
151
+ try:
152
+ pool = PooledDB(**pool_params)
153
+ return pool
154
+ except Exception as e:
155
+ if self.logger:
156
+ self.logger.error("连接池创建失败: %s", str(e))
157
+ raise ConnectionError(f"连接池创建失败: {str(e)}")
158
+
159
+ def _validate_datetime(self, value):
160
+ formats = [
161
+ '%Y-%m-%d %H:%M:%S',
162
+ '%Y-%m-%d',
163
+ '%Y/%m/%d %H:%M:%S',
164
+ '%Y/%m/%d',
165
+ '%Y%m%d',
166
+ '%Y-%m-%dT%H:%M:%S', # ISO格式
167
+ '%Y-%m-%d %H:%M:%S.%f' # 带毫秒
168
+ ]
169
+ for fmt in formats:
170
+ try:
171
+ return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
172
+ except ValueError:
173
+ continue
174
+ raise ValueError(f"无效的日期格式: {value}")
116
175
 
117
176
  def _validate_identifier(self, identifier: str) -> str:
118
177
  """
@@ -121,14 +180,31 @@ class MySQLUploader:
121
180
 
122
181
  :param identifier: 要验证的标识符
123
182
  :return: 清理后的安全标识符
183
+ :raises ValueError: 如果标识符无效
124
184
  """
125
185
  if not identifier or not isinstance(identifier, str):
126
- raise ValueError(f"Invalid identifier: {identifier}")
186
+ error_msg = f"无效的标识符: {identifier}"
187
+ if self.logger:
188
+ self.logger.error(error_msg)
189
+ raise ValueError(error_msg)
127
190
 
128
- # 移除可能有害的字符
129
- cleaned = re.sub(r'[^a-zA-Z0-9_$]', '', identifier)
191
+ # 移除可能有害的字符,只保留字母、数字、下划线和美元符号
192
+ cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
130
193
  if not cleaned:
131
- raise ValueError(f"Invalid identifier after cleaning: {identifier}")
194
+ error_msg = f"无法清理异常标识符: {identifier}"
195
+ if self.logger:
196
+ self.logger.error(error_msg)
197
+ raise ValueError(error_msg)
198
+
199
+ # 检查是否为MySQL保留字
200
+ mysql_keywords = {
201
+ 'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
202
+ 'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
203
+ }
204
+ if cleaned.lower() in mysql_keywords:
205
+ if self.logger:
206
+ self.logger.warning("存在MySQL保留字: %s", cleaned)
207
+ return f"`{cleaned}`"
132
208
 
133
209
  return cleaned
134
210
 
@@ -139,25 +215,42 @@ class MySQLUploader:
139
215
  :param value: 要验证的值
140
216
  :param column_type: 列的数据类型
141
217
  :return: 清理后的值
218
+ :raises ValueError: 如果值转换失败
142
219
  """
143
220
  if value is None:
144
221
  return None
145
222
 
146
223
  try:
147
- if 'int' in column_type.lower():
224
+ column_type_lower = column_type.lower()
225
+
226
+ if 'int' in column_type_lower:
148
227
  return int(value) if value is not None else None
149
- elif 'float' in column_type.lower() or 'double' in column_type.lower() or 'decimal' in column_type.lower():
228
+ elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
150
229
  return float(value) if value is not None else None
151
- elif 'date' in column_type.lower() or 'time' in column_type.lower():
230
+ elif '日期' in column_type_lower or 'time' in column_type_lower:
152
231
  if isinstance(value, (datetime.datetime, pd.Timestamp)):
153
232
  return value.strftime('%Y-%m-%d %H:%M:%S')
233
+ elif isinstance(value, str):
234
+ try:
235
+ return self._validate_datetime(value) # 使用专门的日期验证方法
236
+ except ValueError as e:
237
+ raise ValueError(f"无效日期格式: {value} - {str(e)}")
154
238
  return str(value)
155
- elif 'char' in column_type.lower() or 'text' in column_type.lower():
239
+ elif 'char' in column_type_lower or 'text' in column_type_lower:
240
+ # 防止SQL注入
241
+ if isinstance(value, str):
242
+ return value.replace('\\', '\\\\').replace("'", "\\'")
156
243
  return str(value)
244
+ elif 'json' in column_type_lower:
245
+ import json
246
+ return json.dumps(value) if value is not None else None
157
247
  else:
158
248
  return value
159
249
  except (ValueError, TypeError) as e:
160
- raise ValueError(f"Failed to convert value {value} to type {column_type}: {str(e)}")
250
+ error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
251
+ if self.logger:
252
+ self.logger.error(error_msg)
253
+ raise ValueError(error_msg)
161
254
 
162
255
  def _execute_with_retry(self, func, *args, **kwargs):
163
256
  """
@@ -167,6 +260,7 @@ class MySQLUploader:
167
260
  :param args: 位置参数
168
261
  :param kwargs: 关键字参数
169
262
  :return: 函数执行结果
263
+ :raises Exception: 如果所有重试都失败
170
264
  """
171
265
 
172
266
  @wraps(func)
@@ -174,41 +268,95 @@ class MySQLUploader:
174
268
  last_exception = None
175
269
  for attempt in range(self.max_retries):
176
270
  try:
177
- return func(*args, **kwargs)
178
- except pymysql.OperationalError as e:
271
+ result = func(*args, **kwargs)
272
+ if attempt > 0 and self.logger:
273
+ self.logger.info("Operation succeeded after %d retries", attempt)
274
+ return result
275
+ except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
179
276
  last_exception = e
180
277
  if attempt < self.max_retries - 1:
181
- time.sleep(self.retry_interval)
278
+ wait_time = self.retry_interval * (attempt + 1)
279
+ if self.logger:
280
+ self.logger.warning(
281
+ "尝试 %d/%d 失败: %s. %d秒后重试...",
282
+ attempt + 1, self.max_retries, str(e), wait_time
283
+ )
284
+ time.sleep(wait_time)
182
285
  # 尝试重新连接
183
- self.pool = self._create_connection_pool()
286
+ try:
287
+ self.pool = self._create_connection_pool()
288
+ except Exception as reconnect_error:
289
+ if self.logger:
290
+ self.logger.error("重连失败: %s", str(reconnect_error))
184
291
  continue
185
- raise last_exception if last_exception else Exception("Unknown error occurred")
292
+ else:
293
+ if self.logger:
294
+ self.logger.error(
295
+ "Operation failed after %d attempts. Last error: %s",
296
+ self.max_retries, str(e)
297
+ )
298
+ except pymysql.IntegrityError as e:
299
+ # 完整性错误通常不需要重试
300
+ if self.logger:
301
+ self.logger.error("完整性约束错误: %s", str(e))
302
+ raise e
303
+ except Exception as e:
304
+ last_exception = e
305
+ if self.logger:
306
+ self.logger.error("发生意外错误: %s", str(e))
307
+ break
308
+
309
+ raise last_exception if last_exception else Exception("发生未知错误")
186
310
 
187
311
  return wrapper(*args, **kwargs)
188
312
 
189
313
  def _get_connection(self):
190
314
  """从连接池获取连接"""
191
- return self.pool.connection()
315
+ try:
316
+ conn = self.pool.connection()
317
+ if self.logger:
318
+ self.logger.debug("成功获取数据库连接")
319
+ return conn
320
+ except Exception as e:
321
+ if self.logger:
322
+ self.logger.error("连接数据库失败: %s", str(e))
323
+ raise ConnectionError(f"连接数据库失败: {str(e)}")
192
324
 
193
325
  def _check_database_exists(self, db_name: str) -> bool:
194
326
  """检查数据库是否存在"""
195
327
  db_name = self._validate_identifier(db_name)
196
328
  sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
197
329
 
198
- with self._get_connection() as conn:
199
- with conn.cursor() as cursor:
200
- cursor.execute(sql, (db_name,))
201
- return bool(cursor.fetchone())
330
+ try:
331
+ with self._get_connection() as conn:
332
+ with conn.cursor() as cursor:
333
+ cursor.execute(sql, (db_name,))
334
+ exists = bool(cursor.fetchone())
335
+ if self.logger:
336
+ self.logger.debug("数据库 %s 已存在: %s", db_name, exists)
337
+ return exists
338
+ except Exception as e:
339
+ if self.logger:
340
+ self.logger.error("检查数据库是否存在时出错: %s", str(e))
341
+ raise
202
342
 
203
343
  def _create_database(self, db_name: str):
204
344
  """创建数据库"""
205
345
  db_name = self._validate_identifier(db_name)
206
346
  sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
207
347
 
208
- with self._get_connection() as conn:
209
- with conn.cursor() as cursor:
210
- cursor.execute(sql)
211
- conn.commit()
348
+ try:
349
+ with self._get_connection() as conn:
350
+ with conn.cursor() as cursor:
351
+ cursor.execute(sql)
352
+ conn.commit()
353
+ if self.logger:
354
+ self.logger.info("数据库 %s 创建成功", db_name)
355
+ except Exception as e:
356
+ if self.logger:
357
+ self.logger.error("无法创建数据库 %s: %s", db_name, str(e))
358
+ conn.rollback()
359
+ raise
212
360
 
213
361
  def _check_table_exists(self, db_name: str, table_name: str) -> bool:
214
362
  """检查表是否存在"""
@@ -220,10 +368,16 @@ class MySQLUploader:
220
368
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
221
369
  """
222
370
 
223
- with self._get_connection() as conn:
224
- with conn.cursor() as cursor:
225
- cursor.execute(sql, (db_name, table_name))
226
- return bool(cursor.fetchone())
371
+ try:
372
+ with self._get_connection() as conn:
373
+ with conn.cursor() as cursor:
374
+ cursor.execute(sql, (db_name, table_name))
375
+ exists = bool(cursor.fetchone())
376
+ return exists
377
+ except Exception as e:
378
+ if self.logger:
379
+ self.logger.error("检查数据表是否存在时发生未知错误: %s", str(e))
380
+ raise
227
381
 
228
382
  def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
229
383
  """获取表的列名和数据类型"""
@@ -233,69 +387,21 @@ class MySQLUploader:
233
387
  SELECT COLUMN_NAME, DATA_TYPE
234
388
  FROM INFORMATION_SCHEMA.COLUMNS
235
389
  WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
390
+ ORDER BY ORDINAL_POSITION
236
391
  """
237
392
 
238
- with self._get_connection() as conn:
239
- with conn.cursor() as cursor:
240
- cursor.execute(sql, (db_name, table_name))
241
- return {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
242
-
243
- def _create_table(
244
- self,
245
- db_name: str,
246
- table_name: str,
247
- columns: Dict[str, str],
248
- primary_keys: Optional[List[str]] = None,
249
- date_column: Optional[str] = None
250
- ):
251
- """
252
- 创建数据表
253
-
254
- :param db_name: 数据库名
255
- :param table_name: 表名
256
- :param columns: 列名和数据类型字典 {列名: 数据类型}
257
- :param primary_keys: 主键列列表
258
- :param date_column: 日期列名,如果存在将设置为索引
259
- """
260
- db_name = self._validate_identifier(db_name)
261
- table_name = self._validate_identifier(table_name)
262
-
263
- if not columns:
264
- raise ValueError("No columns specified for table creation")
265
-
266
- # 构建列定义SQL
267
- column_defs = []
268
- for col_name, col_type in columns.items():
269
- safe_col_name = self._validate_identifier(col_name)
270
- col_def = f"`{safe_col_name}` {col_type}"
271
- column_defs.append(col_def)
272
-
273
- # 添加主键定义
274
- primary_key_sql = ""
275
- if primary_keys:
276
- safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
277
- primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
278
-
279
- # 构建完整SQL
280
- sql = f"""
281
- CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
282
- {','.join(column_defs)}
283
- {primary_key_sql}
284
- ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
285
- """
286
-
287
- with self._get_connection() as conn:
288
- with conn.cursor() as cursor:
289
- cursor.execute(sql)
290
-
291
- # 如果存在日期列,添加索引
292
- if date_column and date_column in columns:
293
- safe_date_col = self._validate_identifier(date_column)
294
- index_sql = f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
393
+ try:
394
+ with self._get_connection() as conn:
295
395
  with conn.cursor() as cursor:
296
- cursor.execute(index_sql)
297
-
298
- conn.commit()
396
+ cursor.execute(sql, (db_name, table_name))
397
+ columns = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
398
+ if self.logger:
399
+ self.logger.debug("获取表 %s.%s 的列信息: %s", db_name, table_name, columns)
400
+ return columns
401
+ except Exception as e:
402
+ if self.logger:
403
+ self.logger.error("无法获取表列信息: %s", str(e))
404
+ raise
299
405
 
300
406
  def _prepare_data(
301
407
  self,
@@ -310,31 +416,51 @@ class MySQLUploader:
310
416
  :param columns: 列名和数据类型字典 {列名: 数据类型}
311
417
  :param allow_null: 是否允许空值
312
418
  :return: 准备好的数据列表
419
+ :raises ValueError: 如果数据验证失败
313
420
  """
314
421
  # 统一数据格式为字典列表
315
422
  if isinstance(data, pd.DataFrame):
316
- data = data.to_dict('records')
423
+ try:
424
+ data = data.replace({pd.NA: None}).to_dict('records')
425
+ except Exception as e:
426
+ if self.logger:
427
+ self.logger.error("Failed to convert DataFrame to dict: %s", str(e))
428
+ raise ValueError(f"Failed to convert DataFrame to dict: {str(e)}")
317
429
  elif isinstance(data, dict):
318
430
  data = [data]
319
-
320
- if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
321
- raise ValueError("Data must be a dict, list of dicts, or DataFrame")
431
+ elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
432
+ error_msg = "Data must be a dict, list of dicts, or DataFrame"
433
+ if self.logger:
434
+ self.logger.error(error_msg)
435
+ raise ValueError(error_msg)
322
436
 
323
437
  prepared_data = []
324
- for row in data:
438
+ for row_idx, row in enumerate(data, 1):
325
439
  prepared_row = {}
326
440
  for col_name, col_type in columns.items():
441
+ # 跳过id列,不允许外部传入id
442
+ if col_name.lower() == 'id':
443
+ continue
444
+
327
445
  if col_name not in row:
328
446
  if not allow_null:
329
- raise ValueError(f"Missing required column '{col_name}' in data")
447
+ error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
448
+ if self.logger:
449
+ self.logger.error(error_msg)
450
+ raise ValueError(error_msg)
330
451
  prepared_row[col_name] = None
331
452
  else:
332
453
  try:
333
454
  prepared_row[col_name] = self._validate_value(row[col_name], col_type)
334
455
  except ValueError as e:
335
- raise ValueError(f"Error in column '{col_name}': {str(e)}")
456
+ error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
457
+ if self.logger:
458
+ self.logger.error(error_msg)
459
+ raise ValueError(error_msg)
336
460
  prepared_data.append(prepared_row)
337
461
 
462
+ if self.logger:
463
+ self.logger.debug("已准备 %d 行数据", len(prepared_data))
338
464
  return prepared_data
339
465
 
340
466
  def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
@@ -345,6 +471,7 @@ class MySQLUploader:
345
471
  :param date_value: 日期值
346
472
  :param partition_by: 分表方式 ('year' 或 'month')
347
473
  :return: 分表名称
474
+ :raises ValueError: 如果日期格式无效或分表方式无效
348
475
  """
349
476
  try:
350
477
  date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
@@ -352,92 +479,141 @@ class MySQLUploader:
352
479
  try:
353
480
  date_obj = datetime.datetime.strptime(date_value, '%Y-%m-%d')
354
481
  except ValueError:
355
- raise ValueError(f"Invalid date format: {date_value}")
482
+ error_msg = f"无效的日期格式: {date_value}"
483
+ if self.logger:
484
+ self.logger.error("无效的日期格式: %s", date_value)
485
+ raise ValueError(error_msg)
356
486
 
357
487
  if partition_by == 'year':
358
488
  return f"{table_name}_{date_obj.year}"
359
489
  elif partition_by == 'month':
360
490
  return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
361
491
  else:
362
- raise ValueError("partition_by must be 'year' or 'month'")
492
+ error_msg = "partition_by must be 'year' or 'month'"
493
+ if self.logger:
494
+ self.logger.error(error_msg)
495
+ raise ValueError(error_msg)
363
496
 
364
- def _insert_data(
497
+ def _create_table(
365
498
  self,
366
499
  db_name: str,
367
500
  table_name: str,
368
- data: List[Dict],
369
501
  columns: Dict[str, str],
370
- check_duplicate: bool = False,
371
- duplicate_columns: Optional[List[str]] = None,
372
- batch_size: int = 1000
502
+ primary_keys: Optional[List[str]] = None,
503
+ date_column: Optional[str] = None,
504
+ indexes: Optional[List[str]] = None,
505
+ unique_columns: Optional[List[str]] = None
373
506
  ):
374
507
  """
375
- 插入数据到表中
508
+ 创建数据表
376
509
 
377
510
  :param db_name: 数据库名
378
511
  :param table_name: 表名
379
- :param data: 要插入的数据
380
- :param columns: 列名和数据类型字典
381
- :param check_duplicate: 是否检查重复
382
- :param duplicate_columns: 用于检查重复的列列表
383
- :param batch_size: 批量插入的大小
512
+ :param columns: 列名和数据类型字典 {列名: 数据类型}
513
+ :param primary_keys: 主键列列表
514
+ :param date_column: 日期列名,如果存在将设置为索引
515
+ :param indexes: 需要创建索引的列列表
516
+ :param unique_columns: 需要创建唯一索引的列列表
384
517
  """
385
518
  db_name = self._validate_identifier(db_name)
386
519
  table_name = self._validate_identifier(table_name)
387
520
 
388
- if not data:
389
- return
521
+ if not columns:
522
+ error_msg = "No columns specified for table creation"
523
+ if self.logger:
524
+ self.logger.error(error_msg)
525
+ raise ValueError(error_msg)
390
526
 
391
- # 获取所有列名
392
- all_columns = list(columns.keys())
393
- safe_columns = [self._validate_identifier(col) for col in all_columns]
394
- placeholders = ','.join(['%s'] * len(safe_columns))
527
+ # 构建列定义SQL
528
+ column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
395
529
 
396
- # 构建INSERT SQL
397
- if check_duplicate:
398
- if duplicate_columns:
399
- # 只检查指定列的重复
400
- dup_columns = [self._validate_identifier(col) for col in duplicate_columns]
401
- else:
402
- # 检查所有列的重复
403
- dup_columns = safe_columns
530
+ # 添加其他列定义
531
+ for col_name, col_type in columns.items():
532
+ # 跳过id列,因为已经在前面添加了
533
+ if col_name.lower() == 'id':
534
+ continue
535
+ safe_col_name = self._validate_identifier(col_name)
536
+ col_def = f"`{safe_col_name}` {col_type}"
404
537
 
405
- # 构建ON DUPLICATE KEY UPDATE子句
406
- update_clause = ','.join([f"`{col}`=VALUES(`{col}`)" for col in safe_columns])
538
+ # 添加NOT NULL约束
539
+ if not col_type.lower().startswith('json'):
540
+ col_def += " NOT NULL"
407
541
 
408
- sql = f"""
409
- INSERT INTO `{db_name}`.`{table_name}`
410
- (`{'`,`'.join(safe_columns)}`)
411
- VALUES ({placeholders})
412
- ON DUPLICATE KEY UPDATE {update_clause}
413
- """
542
+ column_defs.append(col_def)
543
+
544
+ # 添加主键定义
545
+ if primary_keys:
546
+ # 确保id在主键中
547
+ if 'id' not in [pk.lower() for pk in primary_keys]:
548
+ primary_keys = ['id'] + primary_keys
414
549
  else:
415
- sql = f"""
416
- INSERT INTO `{db_name}`.`{table_name}`
417
- (`{'`,`'.join(safe_columns)}`)
418
- VALUES ({placeholders})
419
- """
550
+ # 如果没有指定主键,则使用id作为主键
551
+ primary_keys = ['id']
420
552
 
421
- # 分批插入数据
422
- with self._get_connection() as conn:
423
- with conn.cursor() as cursor:
424
- for i in range(0, len(data), batch_size):
425
- batch = data[i:i + batch_size]
426
- # 准备批量数据
427
- values = []
428
- for row in batch:
429
- row_values = []
430
- for col in all_columns:
431
- row_values.append(row.get(col))
432
- values.append(row_values)
553
+ # 添加主键定义
554
+ safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
555
+ primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
433
556
 
434
- # 执行批量插入
435
- try:
436
- cursor.executemany(sql, values)
437
- conn.commit()
438
- except Exception as e:
439
- conn.rollback()
440
- raise e
557
+ # 添加唯一索引定义
558
+ unique_index_sql = ""
559
+ if unique_columns:
560
+ for col in unique_columns:
561
+ if col.lower() != 'id' and col in columns:
562
+ safe_col = self._validate_identifier(col)
563
+ unique_index_sql += f", UNIQUE KEY `uk_{safe_col}` (`{safe_col}`)"
564
+
565
+ # 构建完整SQL
566
+ sql = f"""
567
+ CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
568
+ {','.join(column_defs)}
569
+ {primary_key_sql}
570
+ {unique_index_sql}
571
+ ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
572
+ """
573
+
574
+ try:
575
+ with self._get_connection() as conn:
576
+ with conn.cursor() as cursor:
577
+ cursor.execute(sql)
578
+ if self.logger:
579
+ self.logger.info("表 %s.%s 创建成功", db_name, table_name)
580
+
581
+ # 添加普通索引
582
+ index_statements = []
583
+
584
+ # 日期列索引
585
+ if date_column and date_column in columns:
586
+ safe_date_col = self._validate_identifier(date_column)
587
+ index_statements.append(
588
+ f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
589
+ )
590
+
591
+ # 其他索引
592
+ if indexes:
593
+ for idx_col in indexes:
594
+ if idx_col in columns:
595
+ safe_idx_col = self._validate_identifier(idx_col)
596
+ index_statements.append(
597
+ f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
598
+ )
599
+
600
+ # 执行所有索引创建语句
601
+ if index_statements:
602
+ with conn.cursor() as cursor:
603
+ for stmt in index_statements:
604
+ cursor.execute(stmt)
605
+ if self.logger:
606
+ self.logger.debug("Executed index statement: %s", stmt)
607
+
608
+ conn.commit()
609
+ if self.logger:
610
+ self.logger.info("All indexes created successfully for %s.%s", db_name, table_name)
611
+
612
+ except Exception as e:
613
+ if self.logger:
614
+ self.logger.error("创建表 %s.%s 失败: %s", db_name, table_name, str(e))
615
+ conn.rollback()
616
+ raise
441
617
 
442
618
  def upload_data(
443
619
  self,
@@ -451,7 +627,9 @@ class MySQLUploader:
451
627
  allow_null: bool = False,
452
628
  partition_by: Optional[str] = None,
453
629
  partition_date_column: str = '日期',
454
- auto_create: bool = True
630
+ auto_create: bool = True,
631
+ replace: bool = False,
632
+ indexes: Optional[List[str]] = None
455
633
  ):
456
634
  """
457
635
  上传数据到数据库
@@ -465,15 +643,30 @@ class MySQLUploader:
465
643
  :param duplicate_columns: 用于检查重复的列列表,如果不指定则使用所有列
466
644
  :param allow_null: 是否允许空值,默认为False
467
645
  :param partition_by: 分表方式 ('year' 或 'month'),默认为None不分表
468
- :param partition_date_column: 用于分表的日期列名,默认为'日期'
646
+ :param partition_date_column: 用于分表的日期列名,默认为'date'
469
647
  :param auto_create: 是否自动创建不存在的数据库或表,默认为True
648
+ :param replace: 是否使用REPLACE代替INSERT,默认为False
649
+ :param indexes: 需要创建索引的列列表
650
+ :raises ValueError: 如果参数无效或操作失败
470
651
  """
652
+ if self.logger:
653
+ self.logger.info(
654
+ "开始上传数据到 %s.%s (分表方式=%s, 替换模式=%s)",
655
+ db_name, table_name, partition_by, replace
656
+ )
657
+
471
658
  # 验证参数
472
659
  if not columns:
473
- raise ValueError("Columns specification is required")
660
+ error_msg = "Columns specification is required"
661
+ if self.logger:
662
+ self.logger.error(error_msg)
663
+ raise ValueError(error_msg)
474
664
 
475
665
  if partition_by and partition_by not in ['year', 'month']:
476
- raise ValueError("partition_by must be 'year', 'month' or None")
666
+ error_msg = "分表方式必须是 'year' 'month'"
667
+ if self.logger:
668
+ self.logger.error(error_msg)
669
+ raise ValueError(error_msg)
477
670
 
478
671
  # 准备数据
479
672
  prepared_data = self._prepare_data(data, columns, allow_null)
@@ -483,7 +676,16 @@ class MySQLUploader:
483
676
  if auto_create:
484
677
  self._create_database(db_name)
485
678
  else:
486
- raise ValueError(f"Database '{db_name}' does not exist")
679
+ error_msg = f"Database '{db_name}' does not exist"
680
+ if self.logger:
681
+ self.logger.error(error_msg)
682
+ raise ValueError(error_msg)
683
+
684
+ # 确定唯一索引列
685
+ unique_columns = None
686
+ if check_duplicate:
687
+ unique_columns = duplicate_columns if duplicate_columns else [col for col in columns.keys() if
688
+ col.lower() != 'id']
487
689
 
488
690
  # 处理分表逻辑
489
691
  if partition_by:
@@ -491,7 +693,10 @@ class MySQLUploader:
491
693
  partitioned_data = {}
492
694
  for row in prepared_data:
493
695
  if partition_date_column not in row:
494
- raise ValueError(f"Partition date column '{partition_date_column}' not found in data")
696
+ error_msg = f"异常缺失列 '{partition_date_column}'"
697
+ if self.logger:
698
+ self.logger.error(error_msg)
699
+ raise ValueError(error_msg)
495
700
  part_table = self._get_partition_table_name(table_name, str(row[partition_date_column]), partition_by)
496
701
  if part_table not in partitioned_data:
497
702
  partitioned_data[part_table] = []
@@ -502,14 +707,22 @@ class MySQLUploader:
502
707
  self._upload_to_table(
503
708
  db_name, part_table, part_data, columns,
504
709
  primary_keys, check_duplicate, duplicate_columns,
505
- allow_null, auto_create, partition_date_column
710
+ allow_null, auto_create, partition_date_column,
711
+ replace, indexes, unique_columns
506
712
  )
507
713
  else:
508
714
  # 不分表,直接上传
509
715
  self._upload_to_table(
510
716
  db_name, table_name, prepared_data, columns,
511
717
  primary_keys, check_duplicate, duplicate_columns,
512
- allow_null, auto_create, partition_date_column
718
+ allow_null, auto_create, partition_date_column,
719
+ replace, indexes, unique_columns
720
+ )
721
+
722
+ if self.logger:
723
+ self.logger.info(
724
+ "成功上传 %d 行数据到 %s.%s",
725
+ len(prepared_data), db_name, table_name
513
726
  )
514
727
 
515
728
  def _upload_to_table(
@@ -523,41 +736,178 @@ class MySQLUploader:
523
736
  duplicate_columns: Optional[List[str]],
524
737
  allow_null: bool,
525
738
  auto_create: bool,
526
- date_column: Optional[str]
739
+ date_column: Optional[str],
740
+ replace: bool,
741
+ indexes: Optional[List[str]],
742
+ unique_columns: Optional[List[str]] = None
527
743
  ):
528
744
  """实际执行表上传的内部方法"""
529
745
  # 检查表是否存在
530
746
  if not self._check_table_exists(db_name, table_name):
531
747
  if auto_create:
532
- self._create_table(db_name, table_name, columns, primary_keys, date_column)
748
+ self._create_table(db_name, table_name, columns, primary_keys, date_column, indexes, unique_columns)
533
749
  else:
534
- raise ValueError(f"Table '{db_name}.{table_name}' does not exist")
750
+ error_msg = f"Table '{db_name}.{table_name}' does not exist"
751
+ if self.logger:
752
+ self.logger.error(error_msg)
753
+ raise ValueError(error_msg)
535
754
 
536
755
  # 获取表结构并验证
537
756
  table_columns = self._get_table_columns(db_name, table_name)
538
757
  if not table_columns:
539
- raise ValueError(f"Failed to get columns for table '{db_name}.{table_name}'")
758
+ error_msg = f"Failed to get columns for table '{db_name}.{table_name}'"
759
+ if self.logger:
760
+ self.logger.error(error_msg)
761
+ raise ValueError(error_msg)
540
762
 
541
763
  # 验证数据列与表列匹配
542
764
  for col in columns:
543
765
  if col not in table_columns:
544
- raise ValueError(f"Column '{col}' not found in table '{db_name}.{table_name}'")
766
+ error_msg = f"Column '{col}' not found in table '{db_name}.{table_name}'"
767
+ if self.logger:
768
+ self.logger.error(error_msg)
769
+ raise ValueError(error_msg)
545
770
 
546
771
  # 插入数据
547
772
  self._insert_data(
548
773
  db_name, table_name, data, columns,
549
- check_duplicate, duplicate_columns
774
+ check_duplicate, duplicate_columns,
775
+ replace=replace
550
776
  )
551
777
 
778
+ def _insert_data(
779
+ self,
780
+ db_name: str,
781
+ table_name: str,
782
+ data: List[Dict],
783
+ columns: Dict[str, str],
784
+ check_duplicate: bool = False,
785
+ duplicate_columns: Optional[List[str]] = None,
786
+ batch_size: int = 1000,
787
+ replace: bool = False
788
+ ):
789
+ """
790
+ 插入数据到表中
791
+
792
+ :param db_name: 数据库名
793
+ :param table_name: 表名
794
+ :param data: 要插入的数据
795
+ :param columns: 列名和数据类型字典
796
+ :param check_duplicate: 是否检查重复
797
+ :param duplicate_columns: 用于检查重复的列列表
798
+ :param batch_size: 批量插入的大小
799
+ :param replace: 是否使用REPLACE代替INSERT
800
+ :raises Exception: 如果插入失败
801
+ """
802
+ db_name = self._validate_identifier(db_name)
803
+ table_name = self._validate_identifier(table_name)
804
+
805
+ if not data:
806
+ if self.logger:
807
+ self.logger.warning("No data to insert into %s.%s", db_name, table_name)
808
+ return
809
+
810
+ # 获取所有列名
811
+ all_columns = [col for col in columns.keys() if col.lower() != 'id']
812
+ safe_columns = [self._validate_identifier(col) for col in all_columns]
813
+ placeholders = ','.join(['%s'] * len(safe_columns))
814
+
815
+ # 构建SQL语句
816
+ operation = "REPLACE" if replace else "INSERT IGNORE" if check_duplicate else "INSERT"
817
+
818
+ if check_duplicate and not replace:
819
+ # 当check_duplicate=True时,使用INSERT IGNORE来跳过重复记录
820
+ sql = f"""
821
+ {operation} INTO `{db_name}`.`{table_name}`
822
+ (`{'`,`'.join(safe_columns)}`)
823
+ VALUES ({placeholders})
824
+ """
825
+ else:
826
+ sql = f"""
827
+ {operation} INTO `{db_name}`.`{table_name}`
828
+ (`{'`,`'.join(safe_columns)}`)
829
+ VALUES ({placeholders})
830
+ """
831
+
832
+ if len(self._prepared_statements) >= self._max_cached_statements:
833
+ # 移除最旧的缓存
834
+ oldest_key = next(iter(self._prepared_statements))
835
+ del self._prepared_statements[oldest_key]
836
+
837
+ # 缓存预处理语句
838
+ cache_key = f"{db_name}.{table_name}.{operation}.{check_duplicate}"
839
+ if cache_key not in self._prepared_statements:
840
+ self._prepared_statements[cache_key] = sql
841
+ if self.logger:
842
+ self.logger.debug("已缓存预处理语句: %s", cache_key)
843
+
844
+ # 分批插入数据
845
+ with self._get_connection() as conn:
846
+ with conn.cursor() as cursor:
847
+ for i in range(0, len(data), batch_size):
848
+ batch = data[i:i + batch_size]
849
+ # 准备批量数据
850
+ values = []
851
+ for row in batch:
852
+ row_values = []
853
+ for col in all_columns:
854
+ row_values.append(row.get(col))
855
+ values.append(row_values)
856
+
857
+ # 执行批量插入
858
+ try:
859
+ start_time = time.time()
860
+ cursor.executemany(sql, values)
861
+ conn.commit() # 每个批次提交一次
862
+ if self.logger:
863
+ self.logger.debug(
864
+ "成功插入批次 %d-%d/%d 到 %s.%s, 耗时 %.2f 秒",
865
+ i + 1, min(i + batch_size, len(data)), len(data),
866
+ db_name, table_name, time.time() - start_time
867
+ )
868
+ except Exception as e:
869
+ conn.rollback()
870
+ error_msg = f"Failed to insert batch {i + 1}-{min(i + batch_size, len(data))}/{len(data)} into {db_name}.{table_name}: {str(e)}"
871
+ if self.logger:
872
+ self.logger.error(error_msg)
873
+ raise Exception(error_msg)
874
+
552
875
  def close(self):
553
876
  """关闭连接池"""
554
- self.pool.close()
877
+ if hasattr(self, 'pool') and self.pool:
878
+ try:
879
+ # 先关闭所有连接
880
+ while True:
881
+ conn = getattr(self.pool, '_connections', None)
882
+ if not conn or not conn.queue:
883
+ break
884
+ try:
885
+ conn = self.pool.connection()
886
+ conn.close()
887
+ except:
888
+ pass
889
+
890
+ # 然后关闭连接池
891
+ self.pool.close()
892
+ if self.logger:
893
+ self.logger.info("连接池已成功关闭")
894
+ except Exception as e:
895
+ if self.logger:
896
+ self.logger.error("关闭连接池失败: %s", str(e))
897
+ raise
898
+ self.pool = None
555
899
 
556
900
  def __enter__(self):
557
901
  return self
558
902
 
559
903
  def __exit__(self, exc_type, exc_val, exc_tb):
560
904
  self.close()
905
+ if exc_type is not None and self.logger:
906
+ self.logger.error(
907
+ "Exception occurred: %s: %s",
908
+ exc_type.__name__, str(exc_val),
909
+ exc_info=(exc_type, exc_val, exc_tb)
910
+ )
561
911
 
562
912
 
563
913
  class MysqlUpload:
@@ -1624,3 +1974,46 @@ class OptimizeDatas:
1624
1974
 
1625
1975
if __name__ == '__main__':
    # Demo usage — requires a reachable MySQL server.  Kept under the
    # __main__ guard so importing this module has no side effects.

    # Column name -> MySQL column type for the target table.
    columns = {
        'id': 'INT',
        'name': 'VARCHAR(255)',
        'age': 'INT',
        'salary': 'DECIMAL(10,2)',
        '日期': 'DATE'
    }

    # Rows to insert; dict keys must match the names in `columns`.
    data = [
        {'name': 'Alice', 'age': 30, 'salary': 50000.50, '日期': '2023-01-15'},
        {'name': 'Bob', 'age': 25, 'salary': 45000.75, '日期': '2023-02-20'},
        {'name': 'Charlie', 'age': 35, 'salary': 60000.00, '日期': '2023-01-10'}
    ]

    # Use the context manager so the connection pool is always closed,
    # even if upload_data() raises (the old demo only called close() on
    # the success path).
    with MySQLUploader(
        username='root',
        password='1',
        host='localhost',
        port=3306,
        enable_logging=True,
        log_level='INFO'
    ) as uploader:
        uploader.upload_data(
            db_name='test_db',
            table_name='employees',
            data=data,
            columns=columns,
            primary_keys=[],
            check_duplicate=True,
            replace=True,
            duplicate_columns=['name'],
            allow_null=False,
            partition_by='month'  # partition the table by month
        )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.9.0
3
+ Version: 3.9.1
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=7hLUrBXQAGj08UbPoot9b_BwLXRkc-RH_nJSvG9AqTc,17
2
+ mdbq/__version__.py,sha256=LC0UP2VyG12RJ8LWMMTlZDfZCrzzKDWmgNe41Gr89BE,17
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
5
5
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
@@ -8,7 +8,7 @@ mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
8
8
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/mysql.py,sha256=L_UR7TqcZoHZj6dWVZe-ai6X2yc_oULPyUzKy7DHbOw,74493
11
+ mdbq/mysql/mysql.py,sha256=2xjf7j-6PSAmxdubYjwkh71n0Rhdum-VCkvTmQN-V3U,91100
12
12
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
13
13
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
14
14
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
@@ -22,7 +22,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
22
22
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
23
23
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
24
24
  mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
25
- mdbq-3.9.0.dist-info/METADATA,sha256=pd--meyNjH8KaX-ZgnSHqWN9GbEyx59Atb1Wgs6BqVc,363
26
- mdbq-3.9.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
27
- mdbq-3.9.0.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
28
- mdbq-3.9.0.dist-info/RECORD,,
25
+ mdbq-3.9.1.dist-info/METADATA,sha256=2J1EsfDaQ5gl0dAjFvNDwF2U0xB1R9jmmICm9-XwH7s,363
26
+ mdbq-3.9.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
27
+ mdbq-3.9.1.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
28
+ mdbq-3.9.1.dist-info/RECORD,,
File without changes