mdbq 3.8.19__py3-none-any.whl → 3.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.8.19'
1
+ VERSION = '3.9.1'
mdbq/mysql/mysql.py CHANGED
@@ -11,7 +11,8 @@ from sqlalchemy import create_engine
11
11
  import os
12
12
  import logging
13
13
  from mdbq.other import otk
14
-
14
+ from dbutils.pooled_db import PooledDB
15
+ from typing import Union, List, Dict, Optional, Any, Tuple
15
16
  warnings.filterwarnings('ignore')
16
17
  """
17
18
  建表流程:
@@ -43,6 +44,872 @@ def count_decimal_places(num_str):
43
44
  return 0, 0
44
45
 
45
46
 
47
class MySQLUploader:
    """Upload dict / list-of-dict / DataFrame data to MySQL via a connection pool.

    The uploader can create the target database and table on demand, split
    rows across per-year or per-month tables, and insert in batches using
    INSERT, INSERT IGNORE or REPLACE.

    All values are sent to the server as bound parameters; identifiers are
    sanitized by ``_validate_identifier`` and quoted with backticks by the
    methods that build SQL.
    """

    # Formats accepted by _validate_datetime, tried in this order.
    _DATETIME_FORMATS = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y/%m/%d',
        '%Y%m%d',
        '%Y-%m-%dT%H:%M:%S',  # ISO format
        '%Y-%m-%d %H:%M:%S.%f',  # with fractional seconds
    )

    # Identifiers that trigger a warning in _validate_identifier.
    _RESERVED_WORDS = frozenset({
        'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
        'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between',
    })

    def __init__(
            self,
            username: str,
            password: str,
            host: str = 'localhost',
            port: int = 3306,
            charset: str = 'utf8mb4',
            collation: str = 'utf8mb4_0900_ai_ci',
            enable_logging: bool = False,
            log_level: str = 'ERROR',
            max_retries: int = 10,
            retry_interval: int = 10,
            pool_size: int = 5,
            connect_timeout: int = 10,
            read_timeout: int = 30,
            write_timeout: int = 30,
            ssl: Optional[Dict] = None
    ):
        """
        Initialize the uploader and create its connection pool.

        :param username: database user name
        :param password: database password
        :param host: database host, defaults to localhost
        :param port: database port, defaults to 3306
        :param charset: character set, defaults to utf8mb4
        :param collation: collation, defaults to utf8mb4_0900_ai_ci
        :param enable_logging: whether to enable logging, defaults to False
        :param log_level: log level name, defaults to ERROR
        :param max_retries: maximum number of attempts (clamped to >= 1)
        :param retry_interval: base retry interval in seconds (clamped to >= 1)
        :param pool_size: connection pool size (clamped to >= 1)
        :param connect_timeout: connect timeout in seconds
        :param read_timeout: read timeout in seconds
        :param write_timeout: write timeout in seconds
        :param ssl: SSL configuration dict (needs 'ca', 'cert', 'key'), or None
        :raises ValueError: if ``ssl`` is given but incomplete
        :raises ConnectionError: if the pool cannot be created
        """
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.charset = charset
        self.collation = collation
        self.max_retries = max(max_retries, 1)
        self.retry_interval = max(retry_interval, 1)
        self.pool_size = max(pool_size, 1)
        self.connect_timeout = connect_timeout
        self.read_timeout = read_timeout
        self.write_timeout = write_timeout
        self.ssl = ssl
        self._prepared_statements = {}  # cache of generated INSERT statements
        self._max_cached_statements = 100  # upper bound for that cache

        # The logger must exist (or be None) before the pool is created,
        # because pool creation reports failures through it.
        self.logger = None
        if enable_logging:
            self._init_logging(log_level)

        self.pool = self._create_connection_pool()

    def _log(self, level: str, message: str, *args, **kwargs):
        """Forward to ``self.logger.<level>`` when logging is enabled; else no-op."""
        logger = getattr(self, 'logger', None)
        if logger:
            getattr(logger, level)(message, *args, **kwargs)

    def _init_logging(self, log_level: str):
        """Configure logging; unknown level names fall back to ERROR."""
        valid_levels = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
        level = log_level.upper()
        if level not in valid_levels:
            level = 'ERROR'

        logging.basicConfig(
            level=getattr(logging, level),
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler()]
        )
        self.logger = logging.getLogger('MySQLUploader')
        # basicConfig() does nothing when the root logger is already
        # configured, so set the level on our own logger as well.
        self.logger.setLevel(getattr(logging, level))

    def _create_connection_pool(self) -> "PooledDB":
        """
        Build a PooledDB pool of pymysql connections from the stored settings.

        :return: the new pool
        :raises ValueError: if the SSL dict lacks 'ca', 'cert' or 'key'
        :raises ConnectionError: if PooledDB cannot be constructed
        """
        pool_params = {
            'creator': pymysql,
            'host': self.host,
            'port': self.port,
            'user': self.username,
            'password': self.password,
            'charset': self.charset,
            'cursorclass': pymysql.cursors.DictCursor,
            'maxconnections': self.pool_size,
            'ping': 7,  # connection liveness check mode passed to PooledDB
            'connect_timeout': self.connect_timeout,
            'read_timeout': self.read_timeout,
            'write_timeout': self.write_timeout,
            'autocommit': False
        }

        if self.ssl:
            required_keys = {'ca', 'cert', 'key'}
            if not all(k in self.ssl for k in required_keys):
                raise ValueError("SSL配置必须包含ca、cert和key")
            pool_params['ssl'] = {
                'ca': self.ssl['ca'],
                'cert': self.ssl['cert'],
                'key': self.ssl['key'],
                'check_hostname': self.ssl.get('check_hostname', False)
            }

        try:
            return PooledDB(**pool_params)
        except Exception as e:
            self._log('error', "连接池创建失败: %s", str(e))
            raise ConnectionError(f"连接池创建失败: {str(e)}") from e

    def _validate_datetime(self, value):
        """
        Parse a date/time string and normalize it to 'YYYY-MM-DD HH:MM:SS'.

        :param value: date string in one of ``_DATETIME_FORMATS``
        :return: the normalized string
        :raises ValueError: if no known format matches
        """
        for fmt in self._DATETIME_FORMATS:
            try:
                return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
            except ValueError:
                continue
        raise ValueError(f"无效的日期格式: {value}")

    def _validate_identifier(self, identifier: str) -> str:
        """
        Sanitize a database, table or column name.

        Every character that is not a word character, a CJK ideograph in
        U+4E00..U+9FFF, or '$' is removed.  The result is returned WITHOUT
        backticks: callers either quote it themselves when building SQL or
        bind it as a query parameter, so wrapping it here would produce
        doubled backticks or a name that never matches INFORMATION_SCHEMA.

        :param identifier: the identifier to sanitize
        :return: the sanitized identifier (unquoted)
        :raises ValueError: if the identifier is empty, not a string, or
            nothing remains after sanitizing
        """
        if not identifier or not isinstance(identifier, str):
            error_msg = f"无效的标识符: {identifier}"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
        if not cleaned:
            error_msg = f"无法清理异常标识符: {identifier}"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        # Reserved words are legal once quoted (callers always quote), so
        # only warn about them.
        if cleaned.lower() in self._RESERVED_WORDS:
            self._log('warning', "存在MySQL保留字: %s", cleaned)

        return cleaned

    def _validate_value(self, value: Any, column_type: str) -> Any:
        """
        Convert a value to the Python type matching its column type.

        :param value: the value to convert
        :param column_type: the column's SQL type, matched case-insensitively
            by substring
        :return: the converted value (None stays None)
        :raises ValueError: if the conversion fails
        """
        if value is None:
            return None

        try:
            type_name = column_type.lower()

            if 'int' in type_name:
                return int(value)
            if any(t in type_name for t in ('float', 'double', 'decimal')):
                return float(value)
            # NOTE(review): a plain DATE type matches neither substring and
            # falls through to the final return unchanged - confirm intended.
            if '日期' in type_name or 'time' in type_name:
                if isinstance(value, (datetime.datetime, pd.Timestamp)):
                    return value.strftime('%Y-%m-%d %H:%M:%S')
                if isinstance(value, str):
                    try:
                        return self._validate_datetime(value)
                    except ValueError as e:
                        raise ValueError(f"无效日期格式: {value} - {str(e)}")
                return str(value)
            if 'char' in type_name or 'text' in type_name:
                # Values are bound as query parameters by _insert_data, so
                # the driver does the escaping.  Escaping here as well would
                # store literal backslashes in the data.
                return value if isinstance(value, str) else str(value)
            if 'json' in type_name:
                import json
                return json.dumps(value)
            return value
        except (ValueError, TypeError) as e:
            error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
            self._log('error', error_msg)
            raise ValueError(error_msg) from e

    def _execute_with_retry(self, func, *args, **kwargs):
        """
        Call ``func(*args, **kwargs)``, retrying on MySQL errors.

        Integrity errors are re-raised immediately.  Other MySQL errors are
        retried up to ``max_retries`` attempts, sleeping
        ``retry_interval * attempt_number`` seconds and rebuilding the pool
        between attempts.  Any other exception stops retrying at once.

        :param func: the callable to execute
        :param args: positional arguments for ``func``
        :param kwargs: keyword arguments for ``func``
        :return: whatever ``func`` returns
        :raises Exception: the last error once retrying stops
        """
        last_exception = None
        for attempt in range(self.max_retries):
            try:
                result = func(*args, **kwargs)
            except pymysql.IntegrityError as e:
                # Must precede the MySQLError clause: IntegrityError is a
                # subclass of it, and retrying a constraint violation is
                # pointless.
                self._log('error', "完整性约束错误: %s", str(e))
                raise
            except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
                last_exception = e
                if attempt >= self.max_retries - 1:
                    self._log(
                        'error',
                        "Operation failed after %d attempts. Last error: %s",
                        self.max_retries, str(e)
                    )
                    break
                wait_time = self.retry_interval * (attempt + 1)
                self._log(
                    'warning',
                    "尝试 %d/%d 失败: %s. %d秒后重试...",
                    attempt + 1, self.max_retries, str(e), wait_time
                )
                time.sleep(wait_time)
                self._rebuild_pool()
            except Exception as e:
                last_exception = e
                self._log('error', "发生意外错误: %s", str(e))
                break
            else:
                if attempt > 0:
                    self._log('info', "Operation succeeded after %d retries", attempt)
                return result

        raise last_exception if last_exception else Exception("发生未知错误")

    def _rebuild_pool(self):
        """Replace the pool with a fresh one; keep the old pool if that fails."""
        try:
            new_pool = self._create_connection_pool()
        except Exception as reconnect_error:
            self._log('error', "重连失败: %s", str(reconnect_error))
            return
        old_pool, self.pool = getattr(self, 'pool', None), new_pool
        if old_pool is not None:
            # Best effort: release the replaced pool's idle connections.
            try:
                old_pool.close()
            except Exception as close_error:
                self._log('debug', "closing replaced pool failed: %s", str(close_error))

    def _get_connection(self):
        """
        Take a connection from the pool.

        :return: a pooled connection
        :raises ConnectionError: if no connection can be obtained
        """
        try:
            conn = self.pool.connection()
        except Exception as e:
            self._log('error', "连接数据库失败: %s", str(e))
            raise ConnectionError(f"连接数据库失败: {str(e)}") from e
        self._log('debug', "成功获取数据库连接")
        return conn

    def _check_database_exists(self, db_name: str) -> bool:
        """Return True if INFORMATION_SCHEMA.SCHEMATA lists ``db_name``."""
        db_name = self._validate_identifier(db_name)
        sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"

        try:
            with self._get_connection() as conn:
                with conn.cursor() as cursor:
                    cursor.execute(sql, (db_name,))
                    exists = bool(cursor.fetchone())
            self._log('debug', "数据库 %s 已存在: %s", db_name, exists)
            return exists
        except Exception as e:
            self._log('error', "检查数据库是否存在时出错: %s", str(e))
            raise

    def _create_database(self, db_name: str):
        """
        Create the database if it does not exist, using the configured
        character set and collation.

        :param db_name: database name
        """
        db_name = self._validate_identifier(db_name)
        sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"

        # Error handling lives inside the `with` block so that rollback()
        # runs on a connection that is bound and still checked out.
        with self._get_connection() as conn:
            try:
                with conn.cursor() as cursor:
                    cursor.execute(sql)
                conn.commit()
                self._log('info', "数据库 %s 创建成功", db_name)
            except Exception as e:
                self._log('error', "无法创建数据库 %s: %s", db_name, str(e))
                conn.rollback()
                raise

    def _check_table_exists(self, db_name: str, table_name: str) -> bool:
        """Return True if INFORMATION_SCHEMA.TABLES lists ``db_name.table_name``."""
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)
        sql = """
            SELECT TABLE_NAME
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
        """

        try:
            with self._get_connection() as conn:
                with conn.cursor() as cursor:
                    cursor.execute(sql, (db_name, table_name))
                    return bool(cursor.fetchone())
        except Exception as e:
            self._log('error', "检查数据表是否存在时发生未知错误: %s", str(e))
            raise

    def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
        """
        Read a table's column names and data types in ordinal order.

        :return: ``{column_name: data_type}``; empty if no columns are found
        """
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)
        sql = """
            SELECT COLUMN_NAME, DATA_TYPE
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
            ORDER BY ORDINAL_POSITION
        """

        try:
            with self._get_connection() as conn:
                with conn.cursor() as cursor:
                    cursor.execute(sql, (db_name, table_name))
                    columns = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
            self._log('debug', "获取表 %s.%s 的列信息: %s", db_name, table_name, columns)
            return columns
        except Exception as e:
            self._log('error', "无法获取表列信息: %s", str(e))
            raise

    def _prepare_data(
            self,
            data: Union[Dict, List[Dict], pd.DataFrame],
            columns: Dict[str, str],
            allow_null: bool = False
    ) -> List[Dict]:
        """
        Normalize the input to a list of dicts and convert each value.

        Only keys listed in ``columns`` are kept; an 'id' column is always
        skipped because ids are never accepted from the caller.

        :param data: a dict, a list of dicts, or a DataFrame
        :param columns: ``{column_name: column_type}``
        :param allow_null: when True a column missing from a row becomes
            None; when False it raises
        :return: the prepared rows
        :raises ValueError: on unsupported input, a missing column, or a
            failed value conversion
        """
        if isinstance(data, pd.DataFrame):
            try:
                data = data.replace({pd.NA: None}).to_dict('records')
            except Exception as e:
                self._log('error', "Failed to convert DataFrame to dict: %s", str(e))
                raise ValueError(f"Failed to convert DataFrame to dict: {str(e)}") from e
        elif isinstance(data, dict):
            data = [data]
        elif not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            error_msg = "Data must be a dict, list of dicts, or DataFrame"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        prepared_data = []
        for row_idx, row in enumerate(data, 1):
            prepared_row = {}
            for col_name, col_type in columns.items():
                if col_name.lower() == 'id':
                    continue

                if col_name not in row:
                    if not allow_null:
                        error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
                        self._log('error', error_msg)
                        raise ValueError(error_msg)
                    prepared_row[col_name] = None
                    continue

                try:
                    prepared_row[col_name] = self._validate_value(row[col_name], col_type)
                except ValueError as e:
                    error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
                    self._log('error', error_msg)
                    raise ValueError(error_msg) from e
            prepared_data.append(prepared_row)

        self._log('debug', "已准备 %d 行数据", len(prepared_data))
        return prepared_data

    def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
        """
        Build the name of the partition table for a date.

        :param table_name: base table name
        :param date_value: 'YYYY-MM-DD HH:MM:SS' or 'YYYY-MM-DD'
        :param partition_by: 'year' -> ``<table>_<YYYY>``;
            'month' -> ``<table>_<YYYY>_<MM>``
        :return: the partition table name
        :raises ValueError: on an unparsable date or unknown ``partition_by``
        """
        date_obj = None
        for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
            try:
                date_obj = datetime.datetime.strptime(date_value, fmt)
                break
            except ValueError:
                continue
        if date_obj is None:
            error_msg = f"无效的日期格式: {date_value}"
            self._log('error', "无效的日期格式: %s", date_value)
            raise ValueError(error_msg)

        if partition_by == 'year':
            return f"{table_name}_{date_obj.year}"
        if partition_by == 'month':
            return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"

        error_msg = "partition_by must be 'year' or 'month'"
        self._log('error', error_msg)
        raise ValueError(error_msg)

    def _create_table(
            self,
            db_name: str,
            table_name: str,
            columns: Dict[str, str],
            primary_keys: Optional[List[str]] = None,
            date_column: Optional[str] = None,
            indexes: Optional[List[str]] = None,
            unique_columns: Optional[List[str]] = None,
            allow_null: bool = False
    ):
        """
        Create a table with an auto-increment ``id`` plus the given columns.

        :param db_name: database name
        :param table_name: table name
        :param columns: ``{column_name: column_type}``; the type text is
            placed into the DDL verbatim
        :param primary_keys: primary key columns; ``id`` is always included
        :param date_column: date column that gets an index if it is a column
        :param indexes: further columns to index
        :param unique_columns: columns that each get their own unique key
        :param allow_null: when True, columns are created without NOT NULL
            so rows prepared with ``allow_null=True`` can be inserted
        :raises ValueError: if ``columns`` is empty
        """
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)

        if not columns:
            error_msg = "No columns specified for table creation"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
        for col_name, col_type in columns.items():
            if col_name.lower() == 'id':
                continue  # already defined above
            safe_col_name = self._validate_identifier(col_name)
            # NOTE(review): col_type is interpolated unchecked; it must come
            # from trusted configuration, never from user input.
            col_def = f"`{safe_col_name}` {col_type}"
            if not allow_null and not col_type.lower().startswith('json'):
                col_def += " NOT NULL"
            column_defs.append(col_def)

        # `id` is always part of the primary key.
        if primary_keys:
            if 'id' not in [pk.lower() for pk in primary_keys]:
                primary_keys = ['id'] + primary_keys
        else:
            primary_keys = ['id']
        safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
        primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"

        # NOTE(review): every listed column gets its OWN unique key, so
        # uniqueness is enforced per column, not on the combination -
        # confirm this is the intended duplicate-check semantics.
        unique_index_sql = ""
        for col in unique_columns or ():
            if col.lower() != 'id' and col in columns:
                safe_col = self._validate_identifier(col)
                unique_index_sql += f", UNIQUE KEY `uk_{safe_col}` (`{safe_col}`)"

        sql = f"""
        CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
            {','.join(column_defs)}
            {primary_key_sql}
            {unique_index_sql}
        ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
        """

        # Columns to index, de-duplicated so that a date column that is also
        # listed in `indexes` does not yield two indexes named idx_<col>.
        index_columns = []
        candidates = ([date_column] if date_column else []) + list(indexes or ())
        for candidate in candidates:
            if candidate in columns:
                safe_candidate = self._validate_identifier(candidate)
                if safe_candidate not in index_columns:
                    index_columns.append(safe_candidate)
        index_statements = [
            f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{col}` (`{col}`)"
            for col in index_columns
        ]

        # Error handling lives inside the `with` block so that rollback()
        # runs on a connection that is bound and still checked out.
        with self._get_connection() as conn:
            try:
                with conn.cursor() as cursor:
                    cursor.execute(sql)
                self._log('info', "表 %s.%s 创建成功", db_name, table_name)

                if index_statements:
                    with conn.cursor() as cursor:
                        for stmt in index_statements:
                            cursor.execute(stmt)
                            self._log('debug', "Executed index statement: %s", stmt)

                conn.commit()
                self._log('info', "All indexes created successfully for %s.%s", db_name, table_name)
            except Exception as e:
                self._log('error', "创建表 %s.%s 失败: %s", db_name, table_name, str(e))
                conn.rollback()
                raise

    def upload_data(
            self,
            db_name: str,
            table_name: str,
            data: Union[Dict, List[Dict], pd.DataFrame],
            columns: Dict[str, str],
            primary_keys: Optional[List[str]] = None,
            check_duplicate: bool = False,
            duplicate_columns: Optional[List[str]] = None,
            allow_null: bool = False,
            partition_by: Optional[str] = None,
            partition_date_column: str = '日期',
            auto_create: bool = True,
            replace: bool = False,
            indexes: Optional[List[str]] = None
    ):
        """
        Upload data to the database.

        :param db_name: database name
        :param table_name: table name (base name when partitioning)
        :param data: a dict, a list of dicts, or a DataFrame
        :param columns: ``{column_name: column_type}``
        :param primary_keys: primary key columns
        :param check_duplicate: skip duplicates via INSERT IGNORE, default False
        :param duplicate_columns: columns used for the duplicate check; all
            non-id columns when omitted
        :param allow_null: whether missing columns may become NULL, default False
        :param partition_by: 'year' or 'month'; None (default) disables
            partitioning
        :param partition_date_column: date column used for partitioning and
            date indexing, defaults to '日期'
        :param auto_create: create a missing database/table, default True
        :param replace: use REPLACE instead of INSERT, default False
        :param indexes: columns to index when the table is created
        :raises ValueError: on invalid arguments or failed validation
        """
        self._log(
            'info',
            "开始上传数据到 %s.%s (分表方式=%s, 替换模式=%s)",
            db_name, table_name, partition_by, replace
        )

        if not columns:
            error_msg = "Columns specification is required"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        if partition_by and partition_by not in ['year', 'month']:
            error_msg = "分表方式必须是 'year' 或 'month'"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        prepared_data = self._prepare_data(data, columns, allow_null)

        if not self._check_database_exists(db_name):
            if not auto_create:
                error_msg = f"Database '{db_name}' does not exist"
                self._log('error', error_msg)
                raise ValueError(error_msg)
            self._create_database(db_name)

        unique_columns = None
        if check_duplicate:
            unique_columns = duplicate_columns if duplicate_columns else [
                col for col in columns if col.lower() != 'id'
            ]

        if partition_by:
            # Group the rows by the partition table they belong to.
            targets = {}
            for row in prepared_data:
                if partition_date_column not in row:
                    error_msg = f"异常缺失列 '{partition_date_column}'"
                    self._log('error', error_msg)
                    raise ValueError(error_msg)
                part_table = self._get_partition_table_name(
                    table_name, str(row[partition_date_column]), partition_by
                )
                targets.setdefault(part_table, []).append(row)
        else:
            targets = {table_name: prepared_data}

        for target_table, target_rows in targets.items():
            self._upload_to_table(
                db_name, target_table, target_rows, columns,
                primary_keys, check_duplicate, duplicate_columns,
                allow_null, auto_create, partition_date_column,
                replace, indexes, unique_columns
            )

        self._log(
            'info',
            "成功上传 %d 行数据到 %s.%s",
            len(prepared_data), db_name, table_name
        )

    def _upload_to_table(
            self,
            db_name: str,
            table_name: str,
            data: List[Dict],
            columns: Dict[str, str],
            primary_keys: Optional[List[str]],
            check_duplicate: bool,
            duplicate_columns: Optional[List[str]],
            allow_null: bool,
            auto_create: bool,
            date_column: Optional[str],
            replace: bool,
            indexes: Optional[List[str]],
            unique_columns: Optional[List[str]] = None
    ):
        """
        Ensure one concrete table exists and matches ``columns``, then insert.

        :raises ValueError: if the table is missing and ``auto_create`` is
            False, its columns cannot be read, or a requested column is absent
        """
        if not self._check_table_exists(db_name, table_name):
            if not auto_create:
                error_msg = f"Table '{db_name}.{table_name}' does not exist"
                self._log('error', error_msg)
                raise ValueError(error_msg)
            self._create_table(
                db_name, table_name, columns, primary_keys, date_column,
                indexes, unique_columns, allow_null=allow_null
            )

        table_columns = self._get_table_columns(db_name, table_name)
        if not table_columns:
            error_msg = f"Failed to get columns for table '{db_name}.{table_name}'"
            self._log('error', error_msg)
            raise ValueError(error_msg)

        # Compare sanitized names: those are what _create_table wrote and
        # what _insert_data will put into the INSERT statement.
        for col in columns:
            if self._validate_identifier(col) not in table_columns:
                error_msg = f"Column '{col}' not found in table '{db_name}.{table_name}'"
                self._log('error', error_msg)
                raise ValueError(error_msg)

        self._insert_data(
            db_name, table_name, data, columns,
            check_duplicate, duplicate_columns,
            replace=replace
        )

    def _insert_data(
            self,
            db_name: str,
            table_name: str,
            data: List[Dict],
            columns: Dict[str, str],
            check_duplicate: bool = False,
            duplicate_columns: Optional[List[str]] = None,
            batch_size: int = 1000,
            replace: bool = False
    ):
        """
        Insert rows in batches, committing after each batch.

        :param db_name: database name
        :param table_name: table name
        :param data: prepared rows (dicts keyed by column name)
        :param columns: ``{column_name: column_type}``; 'id' is never inserted
        :param check_duplicate: use INSERT IGNORE (unless ``replace`` is set)
        :param duplicate_columns: accepted for interface compatibility; not
            used by this method
        :param batch_size: rows per ``executemany`` call (values below 1 are
            treated as 1)
        :param replace: use REPLACE instead of INSERT
        :raises Exception: if a batch fails; that batch is rolled back and
            earlier batches stay committed
        """
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)

        if not data:
            self._log('warning', "No data to insert into %s.%s", db_name, table_name)
            return

        batch_size = max(int(batch_size), 1)  # range() rejects a zero step

        all_columns = [col for col in columns if col.lower() != 'id']
        safe_columns = [self._validate_identifier(col) for col in all_columns]
        placeholders = ','.join(['%s'] * len(safe_columns))

        # REPLACE takes precedence over INSERT IGNORE.
        operation = "REPLACE" if replace else "INSERT IGNORE" if check_duplicate else "INSERT"
        sql = f"""
            {operation} INTO `{db_name}`.`{table_name}`
            (`{'`,`'.join(safe_columns)}`)
            VALUES ({placeholders})
        """

        # Remember the generated statement, evicting the oldest entry only
        # when a new one actually has to be stored.
        cache_key = f"{db_name}.{table_name}.{operation}.{check_duplicate}"
        if cache_key not in self._prepared_statements:
            if len(self._prepared_statements) >= self._max_cached_statements:
                oldest_key = next(iter(self._prepared_statements))
                del self._prepared_statements[oldest_key]
            self._prepared_statements[cache_key] = sql
            self._log('debug', "已缓存预处理语句: %s", cache_key)

        total = len(data)
        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                for start in range(0, total, batch_size):
                    batch = data[start:start + batch_size]
                    values = [[row.get(col) for col in all_columns] for row in batch]
                    end = min(start + batch_size, total)

                    try:
                        start_time = time.time()
                        cursor.executemany(sql, values)
                        conn.commit()
                        self._log(
                            'debug',
                            "成功插入批次 %d-%d/%d 到 %s.%s, 耗时 %.2f 秒",
                            start + 1, end, total,
                            db_name, table_name, time.time() - start_time
                        )
                    except Exception as e:
                        conn.rollback()
                        error_msg = f"Failed to insert batch {start + 1}-{end}/{total} into {db_name}.{table_name}: {str(e)}"
                        self._log('error', error_msg)
                        raise Exception(error_msg) from e

    def close(self):
        """
        Close the connection pool and drop the reference to it.

        Safe to call more than once.  The pool's own ``close()`` releases its
        idle connections, so no manual draining is attempted.

        :raises Exception: whatever the pool raises while closing; the pool
            reference is cleared either way
        """
        pool = getattr(self, 'pool', None)
        try:
            if pool:
                pool.close()
                self._log('info', "连接池已成功关闭")
        except Exception as e:
            self._log('error', "关闭连接池失败: %s", str(e))
            raise
        finally:
            self.pool = None

    def __enter__(self):
        """Return the uploader itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Log any in-flight exception, then close the pool; never suppresses."""
        # Log first so the report is not lost should close() itself raise.
        if exc_type is not None:
            self._log(
                'error',
                "Exception occurred: %s: %s",
                exc_type.__name__, str(exc_val),
                exc_info=(exc_type, exc_val, exc_tb)
            )
        self.close()
911
+
912
+
46
913
  class MysqlUpload:
47
914
  def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
48
915
  self.username = username
@@ -1107,3 +1974,46 @@ class OptimizeDatas:
1107
1974
 
1108
1975
if __name__ == '__main__':
    # Manual smoke test: upload three sample rows into month-partitioned
    # tables of a local MySQL server.  Connection settings can be overridden
    # through environment variables; the fallbacks are the demo values.
    columns = {
        'id': 'INT',
        'name': 'VARCHAR(255)',
        'age': 'INT',
        'salary': 'DECIMAL(10,2)',
        '日期': 'DATE'
    }

    data = [
        {'name': 'Alice', 'age': 30, 'salary': 50000.50, '日期': '2023-01-15'},
        {'name': 'Bob', 'age': 25, 'salary': 45000.75, '日期': '2023-02-20'},
        {'name': 'Charlie', 'age': 35, 'salary': 60000.00, '日期': '2023-01-10'}
    ]

    # The context manager closes the pool even when the upload raises.
    with MySQLUploader(
        username=os.environ.get('MYSQL_USER', 'root'),
        password=os.environ.get('MYSQL_PASSWORD', '1'),
        host=os.environ.get('MYSQL_HOST', 'localhost'),
        port=int(os.environ.get('MYSQL_PORT', '3306')),
        enable_logging=True,
        log_level='INFO'
    ) as uploader:
        uploader.upload_data(
            db_name='test_db',
            table_name='employees',
            data=data,
            columns=columns,
            primary_keys=[],
            check_duplicate=True,
            replace=True,  # REPLACE takes precedence over INSERT IGNORE
            duplicate_columns=['name'],
            allow_null=False,
            partition_by='month'  # one table per month
        )
mdbq/spider/aikucun.py CHANGED
@@ -257,13 +257,13 @@ class AikuCun:
257
257
  )
258
258
  print(f'正在获取数据({num}/{len(date_list)}): {item_type}榜单 {date}')
259
259
  # print(res.json())
260
- if not res.json()['success']:
260
+ if not res.json().get('success', None):
261
261
  print('没有获取到数据, 请求不成功, 如果连续请求失败 > 5, 则需重新获取cookie后继续')
262
262
  num += 1
263
263
  self.error_count += 1
264
264
  time.sleep(1)
265
265
  continue
266
- if not res.json()['data']['rows']:
266
+ if not res.json().get('data', {}).get('rows', None):
267
267
  print("返回的数据字典异常, ['data']['rows'] 不能为空")
268
268
  num += 1
269
269
  self.error_count += 1
@@ -479,7 +479,10 @@ def main(start_date, end_date=None, item_type=['spu']):
479
479
 
480
480
  if __name__ == '__main__':
481
481
  main(
482
- start_date='2025-03-25',
483
- # end_date='2025-03-26', # 不传则默认到今天
484
- item_type=['spu', 'sku']
482
+ start_date='2025-05-13',
483
+ # end_date='2025-04-28', # 不传则默认到今天
484
+ item_type=[
485
+ # 'spu',
486
+ 'sku'
487
+ ]
485
488
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.8.19
3
+ Version: 3.9.1
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=adbn7k6HeaPn3waOOAb2wMca7SSJxTRZ8MEbtLkPAeA,18
2
+ mdbq/__version__.py,sha256=LC0UP2VyG12RJ8LWMMTlZDfZCrzzKDWmgNe41Gr89BE,17
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
5
5
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
@@ -8,7 +8,7 @@ mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
8
8
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/mysql.py,sha256=umcLpw5cYGNNJnEjBLh_bgBXeh5LntPKFm8VslQ01ow,55030
11
+ mdbq/mysql/mysql.py,sha256=2xjf7j-6PSAmxdubYjwkh71n0Rhdum-VCkvTmQN-V3U,91100
12
12
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
13
13
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
14
14
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
@@ -21,8 +21,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
21
21
  mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
22
22
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
23
23
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
24
- mdbq/spider/aikucun.py,sha256=QfyUtXMuPZ5mJVNDUlFa_ltFXiCCTccBz6MT3YT-7HI,19742
25
- mdbq-3.8.19.dist-info/METADATA,sha256=yheVi8Il8o8Py_HUlKc8s-gCako_d8RktLqdNa-TPSA,364
26
- mdbq-3.8.19.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
27
- mdbq-3.8.19.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
28
- mdbq-3.8.19.dist-info/RECORD,,
24
+ mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
25
+ mdbq-3.9.1.dist-info/METADATA,sha256=2J1EsfDaQ5gl0dAjFvNDwF2U0xB1R9jmmICm9-XwH7s,363
26
+ mdbq-3.9.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
27
+ mdbq-3.9.1.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
28
+ mdbq-3.9.1.dist-info/RECORD,,
File without changes