mdbq 3.8.18__py3-none-any.whl → 3.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.8.18'
1
+ VERSION = '3.9.0'
@@ -0,0 +1,474 @@
1
+ # -*- coding:utf-8 -*-
2
+ import pymysql
3
+ import logging
4
+ from typing import List, Optional, Dict
5
+ import time
6
+ import re
7
+ import os
8
+ import hashlib
9
+ from dbutils.pooled_db import PooledDB
10
+ from mdbq.log import spider_logging
11
+ from mdbq.config import config
12
+ import threading
13
+ import queue
14
+
15
# Connection settings are loaded from ``~/spd.txt`` through the project's
# ``config.read_config`` helper.
# NOTE(review): the result is indexed like a mapping with 'username',
# 'password' and 'port' keys -- confirm against mdbq.config.
dir_path = os.path.expanduser("~")
config_file = os.path.join(dir_path, 'spd.txt')
my_cont = config.read_config(config_file)
username = my_cont['username']
password = my_cont['password']
port = my_cont['port']
# The script always talks to a server on the local machine.
host = '127.0.0.1'
# Module-wide logger; records are written to optimize.log.
logger = spider_logging.setup_logging(reMoveOldHandler=True, filename='optimize.log')
21
+
22
+
23
class MySQLDeduplicator:
    """Delete duplicate rows from MySQL tables and optionally renumber ids.

    Connections are taken from a DBUtils ``PooledDB`` pool of PyMySQL
    connections configured with ``DictCursor``, so every fetched row is a
    dict keyed by column name.
    """

    # Retry policy used by get_connection().
    _CONNECT_ATTEMPTS = 10
    _CONNECT_RETRY_SECONDS = 30
    # Hard cap on delete batches per table so a misbehaving loop cannot spin forever.
    _MAX_BATCHES = 10000

    def __init__(self, host: str, username: str, password: str, port: int = 3306):
        """Create the connection pool and the state used for delayed id resets.

        :param host: MySQL server host
        :param username: login user
        :param password: login password
        :param port: server port; passed through ``int()`` so a string value works
        """
        # Kept so connection-failure messages name this instance's server.
        self.host = host
        self.port = int(port)
        self.pool = PooledDB(
            creator=pymysql,
            maxconnections=10,  # maximum number of connections
            mincached=2,  # idle connections created at start-up
            maxcached=5,  # maximum idle connections kept in the pool
            blocking=True,
            host=host,
            port=self.port,
            user=username,
            password=password,
            ping=1,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        # Column name -> SQL type used when check_db() has to create a table.
        self.set_typ = {
            '日期': 'date',
            '更新时间': 'timestamp',
        }
        self.tables_to_reset = queue.Queue()  # (db, table, id column) awaiting an id reset
        self.delay_time = 120  # seconds to wait before resetting auto-increment ids
        self.lock = threading.Lock()  # available to callers for synchronising critical sections

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* wrapped in backticks with embedded backticks doubled."""
        return "`" + str(name).replace("`", "``") + "`"

    def get_table_in_databases(self, db_list=None, reset_id=False):
        """List every table of the given databases as ``deduplicate`` configs.

        :param db_list: database names to inspect
        :param reset_id: value copied into each returned config's ``reset_id``
        :return: list of ``{'db_name', 'table_name', 'reset_id'}`` dicts; an
            empty list when *db_list* is empty or no connection is available
        """
        if not db_list:
            return []
        connection = self.get_connection()
        if not connection:
            logger.error("获取数据库连接失败, 无法读取数据表列表")
            return []
        res = []
        try:
            for db_name in db_list:
                try:
                    with connection.cursor() as cursor:
                        cursor.execute(f"USE {self._quote_identifier(db_name)}")
                        cursor.execute("SHOW TABLES")
                        tables = cursor.fetchall()
                except Exception as e:
                    # Best effort: one unreadable database must not stop the
                    # others, but the failure is now reported instead of hidden.
                    logger.error(f"{db_name} 获取数据表失败: {e}")
                    continue
                for item in tables:
                    res.append(
                        {
                            'db_name': db_name,
                            'table_name': item.get(f'Tables_in_{db_name}', ''),
                            'reset_id': reset_id,
                        }
                    )
        finally:
            connection.close()
        return res

    def deduplicate(
            self,
            tables_list: List[Dict],
            order_column: str = "更新时间",
            order_direction: str = "DESC",
            batch_size: int = 10000,
            id_column: str = "id",
            recent_months: Optional[int] = None
    ) -> bool:
        """Deduplicate several tables.

        :param tables_list: one dict per table with ``db_name``, ``table_name``
            and optionally ``unique_keys`` / ``reset_id``; any other keyword of
            ``_deduplicate_single_table`` may be overridden per table
        :param order_column: column that decides which duplicate is kept
        :param order_direction: ``ASC`` or ``DESC``
        :param batch_size: number of rows deleted per batch
        :param id_column: auto-increment column name
        :param recent_months: only consider rows whose ``日期`` lies within
            this many months; ``None`` means the whole table
        :return: ``True`` when every table was processed successfully
        """
        if recent_months is not None and (not isinstance(recent_months, int) or recent_months < 1):
            logger.error("recent_months必须为None或正整数")
            return False
        all_ok = True
        for table_config in tables_list or []:
            # Named ``params`` so the imported ``config`` module is not shadowed.
            params = {
                'order_column': order_column,
                'order_direction': order_direction,
                'batch_size': batch_size,
                'id_column': id_column,
                'reset_id': table_config.get('reset_id', False),
                'unique_keys': table_config.get('unique_keys', None),
                'recent_months': recent_months,
            }
            params.update(table_config)
            if not self._deduplicate_single_table(**params):
                all_ok = False
        return all_ok

    def _deduplicate_single_table(
            self,
            db_name: str,
            table_name: str,
            unique_keys: Optional[List[str]],
            order_column: str,
            order_direction: str,
            batch_size: int,
            reset_id: bool,
            id_column: str,
            recent_months: Optional[int] = None
    ):
        """Deduplicate one table.

        Rows are partitioned by *unique_keys* (all columns except *id_column*
        and *order_column* when none are given); within each partition the
        first row by *order_column* is kept and the others are deleted in
        batches through a temporary table of ids.

        :return: ``True`` on success, ``False`` on any failure
        """
        # The direction is spliced into SQL text, so only the two keywords are accepted.
        direction = str(order_direction).strip().upper()
        if direction not in ('ASC', 'DESC'):
            logger.error(f"{db_name}/{table_name} order_direction 必须为 ASC 或 DESC: {order_direction}")
            return False

        connection = self.get_connection(db_name=db_name)
        if not connection:
            logger.error(f"连接数据库失败: {db_name}")
            return False

        temp_suffix = hashlib.md5(f"{table_name}{time.time()}".encode()).hexdigest()[:8]
        temp_table = f"temp_{temp_suffix}"
        q_temp = self._quote_identifier(temp_table)
        q_table = self._quote_identifier(table_name)
        q_id = self._quote_identifier(id_column)
        bad_field = pymysql.constants.ER.BAD_FIELD_ERROR

        try:
            # check_db also performs the server version check.
            if not self.check_db(db_name, table_name):
                return False

            with connection.cursor() as cursor:
                # The id column must be unique, otherwise deleting by id is unsafe.
                try:
                    cursor.execute(
                        f"SELECT COUNT(*) AS total, COUNT(DISTINCT {q_id}) AS distinct_count FROM {q_table}"
                    )
                except pymysql.err.MySQLError as e:
                    # Match on the error code rather than the exception class:
                    # which class carries code 1054 differs between PyMySQL releases.
                    if e.args and e.args[0] == bad_field:
                        logger.error(f"{db_name}/{table_name} 缺少 {id_column} 列, 无法去重")
                        return False
                    raise
                res = cursor.fetchone()
                if res['total'] != res['distinct_count']:
                    logger.error(f"{db_name}/{table_name} 主键重复: {id_column}")
                    return False

                all_columns = self._get_table_columns(db_name, table_name)
                if not all_columns:
                    logger.error(f"{db_name}/{table_name} 无法获取表列信息")
                    return False

                if not unique_keys:
                    # Default key: every column except the id and ordering columns.
                    exclude_set = {c.lower() for c in (id_column, order_column) if c}
                    unique_keys = [col for col in all_columns if col.lower() not in exclude_set]
                    if not unique_keys:
                        logger.warning(f"{db_name}/{table_name} 没有可用于去重的列, 跳过")
                        return False
                elif not self._validate_columns(db_name, table_name, unique_keys):
                    logger.error(f"{db_name}/{table_name} unique_keys中存在无效列名")
                    return False

                partition_clause = ', '.join(self._quote_identifier(col) for col in unique_keys)

                if order_column and self._validate_columns(db_name, table_name, [order_column]):
                    order_clause = f"ORDER BY {self._quote_identifier(order_column)} {direction}"
                else:
                    # No usable ordering column: the surviving row is whatever
                    # the server numbers first.
                    order_clause = ''

                # Optional time filter. The value is an int, so it is written
                # into the statement directly; executing without parameters
                # also keeps a '%' inside a column name from being treated as
                # a format character by the driver.
                where_clause = ""
                date_column_exists = '日期' in all_columns
                if recent_months and recent_months > 0 and date_column_exists:
                    where_clause = f"WHERE `日期` >= DATE_SUB(CURDATE(), INTERVAL {int(recent_months)} MONTH)"
                elif recent_months and not date_column_exists:
                    logger.warning(f"{db_name}/{table_name} 忽略recent_months参数(无日期列)")

                create_temp_sql = f"""
                    CREATE TEMPORARY TABLE {q_temp} AS
                    SELECT tmp_id FROM (
                        SELECT {q_id} AS tmp_id,
                               ROW_NUMBER() OVER (
                                   PARTITION BY {partition_clause}
                                   {order_clause}
                               ) AS row_num
                        FROM {q_table}
                        {where_clause}
                    ) t WHERE row_num > 1
                """
                cursor.execute(create_temp_sql)

                logger.info(f'{db_name}/{table_name} 执行排重任务')
                # These two statements are executed with parameters, so '%' in
                # an identifier has to be doubled.
                delete_main = f"DELETE FROM {q_table} WHERE {q_id} IN ({{}})".replace('%', '%%')
                delete_temp = f"DELETE FROM {q_temp} WHERE tmp_id IN ({{}})"
                total_deleted = 0
                finished = False
                for _ in range(self._MAX_BATCHES):
                    cursor.execute(f"SELECT tmp_id FROM {q_temp} LIMIT %s FOR UPDATE", (batch_size,))
                    batch = cursor.fetchall()
                    if not batch:
                        finished = True
                        break
                    ids = [row['tmp_id'] for row in batch]
                    placeholder = ','.join(['%s'] * len(ids))
                    try:
                        cursor.execute(delete_main.format(placeholder), ids)
                        # Count rows removed from the real table, not from the temp table.
                        total_deleted += cursor.rowcount
                        # Drop the processed ids so the next SELECT returns new ones.
                        cursor.execute(delete_temp.format(placeholder), ids)
                    except pymysql.err.MySQLError as e:
                        if e.args and e.args[0] == bad_field:
                            logger.error(f"{db_name}/{table_name} 无法通过 {id_column} 删除记录,请检查列存在性")
                            return False
                        raise
                    connection.commit()
                    logger.info(f"{db_name}/{table_name} 执行去重, 删除记录数: {total_deleted}")
                if not finished:
                    logger.warning(f"{db_name}/{table_name} 已达最大批次数 {self._MAX_BATCHES}, 可能仍有重复记录")

            if total_deleted > 0:
                logger.info(f"{db_name}/{table_name} 删除记录数总计: {total_deleted}")

            if reset_id:
                if not self._validate_columns(db_name, table_name, [id_column]):
                    return True
                # queue.Queue is itself thread-safe, so no extra lock is taken here.
                self.tables_to_reset.put((db_name, table_name, id_column))
                logger.info(f"{db_name}/{table_name} -> {self.delay_time}秒后重置自增id")
                threading.Timer(self.delay_time, self.delayed_reset_auto_increment).start()

            return True
        except Exception as e:
            logger.error(f"{db_name}/{table_name} 去重操作异常: {e}", exc_info=True)
            try:
                connection.rollback()
            except Exception as rollback_error:
                logger.error(f"{db_name}/{table_name} 回滚失败: {rollback_error}")
            return False
        finally:
            # Always release the connection, even when dropping the temp table fails.
            try:
                with connection.cursor() as cursor:
                    cursor.execute(f"DROP TEMPORARY TABLE IF EXISTS {q_temp}")
            except Exception as e:
                logger.warning(f"{db_name}/{table_name} 清理临时表失败: {e}")
            finally:
                connection.close()

    def _get_table_columns(self, db_name: str, table_name: str) -> List[str]:
        """Return the column names of a table, or ``[]`` when they cannot be read."""
        connection = self.get_connection(db_name=db_name)
        if not connection:
            logger.error(f"{db_name}/{table_name} 获取列失败: 无可用连接")
            return []
        try:
            with connection.cursor() as cursor:
                cursor.execute(f"SHOW COLUMNS FROM {self._quote_identifier(table_name)}")
                return [row["Field"] for row in cursor.fetchall()]
        except pymysql.Error as e:
            logger.error(f"{db_name}/{table_name} 获取列失败: {e}")
            return []
        finally:
            # Hand the connection back to the pool.
            connection.close()

    def check_db(self, db_name: str, table_name: str) -> bool:
        """Make sure the database and table exist, creating them when missing.

        :return: ``True`` when both exist afterwards, ``False`` on any error
        """
        conn = self.get_connection()
        if not conn:
            logger.error(f"{db_name}/{table_name} 数据库检查失败: 无可用连接")
            return False
        q_db = self._quote_identifier(db_name)
        try:
            with conn.cursor() as cursor:
                version = self._check_mysql_version(cursor)
                # NOTE(review): MariaDB also reports versions >= 8.0 but may
                # not provide utf8mb4_0900_ai_ci -- confirm if MariaDB is a target.
                collation = 'utf8mb4_0900_ai_ci' if version >= 8.0 else 'utf8mb4_general_ci'
                cursor.execute(
                    f"CREATE DATABASE IF NOT EXISTS {q_db} CHARACTER SET utf8mb4 COLLATE {collation}"
                )
                conn.commit()
                cursor.execute(f"USE {q_db}")
                if not self._table_exists(cursor, table_name):
                    self._create_table(cursor, table_name)
                    conn.commit()
            return True
        except Exception as e:
            logger.error(f"{db_name}/{table_name} 数据库检查失败: {e}")
            return False
        finally:
            conn.close()

    def get_connection(self, db_name=None):
        """Fetch a pooled connection, optionally switched to *db_name*.

        Retries on PyMySQL errors, except for an unknown database where
        retrying cannot help.

        :return: the connection, or ``None`` when none could be obtained
        """
        for attempt in range(1, self._CONNECT_ATTEMPTS + 1):
            connection = None
            try:
                connection = self.pool.connection()
                if db_name:
                    with connection.cursor() as cursor:
                        cursor.execute(f"USE {self._quote_identifier(db_name)}")
                return connection
            except pymysql.Error as e:
                if connection is not None:
                    # Do not leak the connection whose USE statement failed.
                    try:
                        connection.close()
                    except Exception:
                        pass  # the connection is already unusable
                if e.args and e.args[0] == pymysql.constants.ER.BAD_DB_ERROR:
                    logger.error(f"{db_name} 数据库不存在: {e}")
                    return None
                if attempt < self._CONNECT_ATTEMPTS:
                    logger.error(f"{db_name} 获取连接失败: {e}, {self._CONNECT_RETRY_SECONDS}秒后重试...")
                    time.sleep(self._CONNECT_RETRY_SECONDS)
                else:
                    logger.error(f"{db_name} 获取连接失败: {e}")
        logger.error(f"{self.host}: {self.port} 数据库连接失败,已达最大重试次数")
        return None

    def _validate_identifier(self, name: str) -> bool:
        """Return ``True`` when *name* is 1-64 word characters or ``$``."""
        return (
            isinstance(name, str)
            and len(name) <= 64
            # fullmatch, unlike ``$``, does not tolerate a trailing newline.
            and re.fullmatch(r'[\w$]+', name) is not None
        )

    def _validate_columns(self, db_name: str, table_name: str, columns: List[str]) -> bool:
        """Return ``True`` when every name in *columns* is valid and exists in the table."""
        if not all(self._validate_identifier(col) for col in columns):
            return False
        connection = self.get_connection(db_name=db_name)
        if not connection:
            logger.error(f"{db_name}/{table_name} 列验证失败: 无可用连接")
            return False
        try:
            with connection.cursor() as cursor:
                cursor.execute(f"SHOW COLUMNS FROM {self._quote_identifier(table_name)}")
                existing_columns = {col['Field'] for col in cursor.fetchall()}
            return all(col in existing_columns for col in columns)
        except pymysql.Error as e:
            logger.error(f"{db_name}/{table_name} 列验证失败: {e}")
            return False
        finally:
            connection.close()

    def _check_mysql_version(self, cursor) -> float:
        """Return the server version as ``major.minor`` using the given cursor.

        :raises ValueError: when the version string has no ``N.N`` prefix
        """
        cursor.execute("SELECT VERSION()")
        version = str(cursor.fetchone()['VERSION()'])
        # Parse the digits explicitly; slicing three characters breaks on
        # two-digit components such as "10.11.2".
        match = re.match(r'\s*(\d+)\.(\d+)', version)
        if not match:
            raise ValueError(f"Unrecognised server version: {version!r}")
        return float(f"{match.group(1)}.{match.group(2)}")

    def _table_exists(self, cursor, table_name: str) -> bool:
        """Return ``True`` when *table_name* exists in the cursor's current database."""
        # An exact comparison is used because SHOW TABLES LIKE would treat
        # '_' and '%' inside the name as wildcards.
        cursor.execute(
            "SELECT 1 FROM information_schema.TABLES "
            "WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = %s LIMIT 1",
            (table_name,)
        )
        return cursor.fetchone() is not None

    def _create_table(self, cursor, table_name: str):
        """Create *table_name* with an ``id`` key plus the columns in ``self.set_typ``."""
        columns = ["`id` INT AUTO_INCREMENT PRIMARY KEY"]
        for cn, ct in self.set_typ.items():
            sql_type = ct.upper()
            # Compare against the upper-cased type; the configured types are lower case.
            if 'TIMESTAMP' in sql_type or 'DATETIME' in sql_type:
                default = 'CURRENT_TIMESTAMP'
            elif 'DATE' in sql_type:
                # An empty string is not a valid DATE default.
                # NOTE(review): placeholder date chosen here -- confirm the project's convention.
                default = "'1970-01-01'"
            elif 'INT' in sql_type:
                default = '0'
            else:
                default = "''"
            columns.append(f"{self._quote_identifier(cn)} {sql_type} NOT NULL DEFAULT {default}")
        cursor.execute(
            f"CREATE TABLE {self._quote_identifier(table_name)} ("
            f"{', '.join(columns)}"
            f") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
        )

    def delayed_reset_auto_increment(self):
        """Drain the reset queue, renumbering the ids of every queued table."""
        while not self.tables_to_reset.empty():
            try:
                item = self.tables_to_reset.get_nowait()
                self._safe_reset_auto_increment(*item)
            except queue.Empty:
                # Another timer thread emptied the queue first.
                break

    def _safe_reset_auto_increment(self, db_name: str, table_name: str, id_column: str):
        """Renumber *id_column* from 1 by rebuilding the table and swapping it in.

        The rows are copied into a new table with fresh ids, the two tables
        are swapped with ``RENAME TABLE`` and the old one is dropped. The DDL
        statements commit implicitly, so a failure cannot be rolled back as a
        whole; instead the scratch table is removed if the swap has not
        happened yet, and the backup table is kept if it has.

        NOTE(review): rows written to the table between the copy and the
        rename would be lost -- confirm nothing writes during the reset.

        :return: ``True`` on success, ``False`` on failure
        """
        conn = self.get_connection(db_name)
        if not conn:
            logger.error(f"{db_name}/{table_name} 重置自增id失败: 无可用连接")
            return False

        temp_table = f"reset_{hashlib.md5(table_name.encode()).hexdigest()[:8]}"
        backup_table = f"{table_name}_backup_{int(time.time())}"
        q_table = self._quote_identifier(table_name)
        q_temp = self._quote_identifier(temp_table)
        q_backup = self._quote_identifier(backup_table)
        q_id = self._quote_identifier(id_column)
        swapped = False
        try:
            # Check the column before creating anything that would need cleaning up.
            columns = self._get_table_columns(db_name, table_name)
            if id_column not in columns:
                logger.error(f"列 {id_column} 不存在于表 {table_name}")
                return False
            other_columns = [self._quote_identifier(col) for col in columns if col != id_column]
            # Built as lists so a table holding only the id column still yields valid SQL.
            target_list = ', '.join([q_id] + other_columns)
            select_list = ', '.join([f"ROW_NUMBER() OVER (ORDER BY {q_id})"] + other_columns)

            with conn.cursor() as cursor:
                # The scratch name is derived from the table name only, so a
                # leftover from an earlier failed run would block every retry.
                cursor.execute(f"DROP TABLE IF EXISTS {q_temp}")
                cursor.execute(f"CREATE TABLE {q_temp} LIKE {q_table}")
                cursor.execute(f"ALTER TABLE {q_temp} MODIFY COLUMN {q_id} INT NOT NULL")
                cursor.execute(
                    f"INSERT INTO {q_temp} ({target_list}) "
                    f"SELECT {select_list} FROM {q_table} ORDER BY {q_id}"
                )
                conn.commit()
                cursor.execute(f"RENAME TABLE {q_table} TO {q_backup}, {q_temp} TO {q_table}")
                swapped = True
                cursor.execute(f"ALTER TABLE {q_table} MODIFY COLUMN {q_id} INT AUTO_INCREMENT")
                cursor.execute(f"SELECT MAX({q_id}) + 1 AS next_id FROM {q_table}")
                next_id = cursor.fetchone()['next_id'] or 1
                cursor.execute(f"ALTER TABLE {q_table} AUTO_INCREMENT = {int(next_id)}")
                cursor.execute(f"DROP TABLE IF EXISTS {q_backup}")
            conn.commit()
            logger.info(f'{db_name}/{table_name} 已重置自增id')
            return True
        except Exception as e:
            logger.error(f"{db_name}/{table_name} 重置自增id失败: {e}")
            try:
                conn.rollback()
                if swapped:
                    # The original rows survive in the backup table.
                    logger.error(f"{db_name}/{table_name} 原始数据保留在 {backup_table}")
                else:
                    with conn.cursor() as cursor:
                        cursor.execute(f"DROP TABLE IF EXISTS {q_temp}")
            except Exception as cleanup_error:
                logger.error(f"{db_name}/{table_name} 清理失败: {cleanup_error}")
            return False
        finally:
            conn.close()
422
+
423
+
424
def main():
    """Deduplicate every table of a fixed list of databases.

    Builds a ``MySQLDeduplicator`` from the module-level connection settings,
    collects all tables of the databases listed below (without id reset) and
    deduplicates them using the last three months of data.
    """
    deduplicator = MySQLDeduplicator(
        host=host,
        username=username,
        password=password,
        port=port,
    )
    deduplicator.delay_time = 600
    # A hand-written table list can be passed instead, for example:
    # tables_list = [
    #     {
    #         'db_name': "测试库",
    #         'table_name': "测试库2",
    #         'reset_id': True,  # optional, defaults to False
    #         # 'unique_keys': ["日期", "店铺名称", "商品id"]
    #     }
    # ]
    databases = [
        "京东数据3",
        "属性设置3",
        "推广数据2",
        "推广数据_圣积天猫店",
        "推广数据_淘宝店",
        "推广数据_奥莱店",
        "爱库存2",
        "生意参谋3",
        "生意经3",
        "达摩盘3",
        '人群画像2',
        '商品人群画像2',
        '市场数据3',
        # '数据银行2'
        # '回传数据',
        # '大模型库',
        '安全组',
        # '视频数据',
        # '聚合数据',
    ]
    tables_list = deduplicator.get_table_in_databases(db_list=databases, reset_id=False)
    deduplicator.deduplicate(
        tables_list=tables_list,
        order_column="更新时间",
        order_direction="DESC",
        batch_size=1000,
        id_column="id",
        recent_months=3,
    )
    logger.info('全部任务完成')


if __name__ == "__main__":
    main()
mdbq/mysql/mysql.py CHANGED
@@ -12,6 +12,9 @@ import os
12
12
  import logging
13
13
  from mdbq.other import otk
14
14
 
15
+ from dbutils.pooled_db import PooledDB
16
+ from typing import Union, List, Dict, Optional, Any, Tuple
17
+
15
18
  warnings.filterwarnings('ignore')
16
19
  """
17
20
  建表流程:
@@ -43,6 +46,520 @@ def count_decimal_places(num_str):
43
46
  return 0, 0
44
47
 
45
48
 
49
class MySQLUploader:
    """Upload dicts, lists of dicts or DataFrames into MySQL tables.

    Databases and tables can be created on demand, and rows can be routed to
    per-year or per-month tables derived from a date column. Connections come
    from a DBUtils ``PooledDB`` pool of PyMySQL connections using
    ``DictCursor``. The class is a context manager that closes the pool on exit.
    """

    # Date layouts accepted when deriving a partition table name.
    _DATE_FORMATS = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y/%m/%d',
    )

    def __init__(
            self,
            username: str,
            password: str,
            host: str = 'localhost',
            port: int = 3306,
            charset: str = 'utf8mb4',
            collation: str = 'utf8mb4_0900_ai_ci',
            enable_logging: bool = False,
            log_level: str = 'ERROR',
            max_retries: int = 10,
            retry_interval: int = 10,
            pool_size: int = 5
    ):
        """Store the settings and create the connection pool.

        :param username: database user
        :param password: database password
        :param host: server host, ``localhost`` by default
        :param port: server port, 3306 by default; passed through ``int()``
        :param charset: character set, ``utf8mb4`` by default
        :param collation: collation, ``utf8mb4_0900_ai_ci`` by default
        :param enable_logging: configure logging via ``_init_logging``
        :param log_level: level name used when logging is enabled
        :param max_retries: attempts made by ``_execute_with_retry``
        :param retry_interval: seconds between those attempts
        :param pool_size: maximum number of pooled connections
        """
        self.username = username
        self.password = password
        self.host = host
        self.port = int(port)
        self.charset = charset
        self.collation = collation
        self.max_retries = max_retries
        self.retry_interval = retry_interval
        self.pool_size = pool_size

        # Always present so the attribute can be used whether or not logging
        # was configured.
        self.logger = logging.getLogger('MySQLUploader')
        if enable_logging:
            self._init_logging(log_level)

        self.pool = self._create_connection_pool()

    def _init_logging(self, log_level: str):
        """Configure root logging at *log_level* (ERROR when the name is unknown)."""
        logging.basicConfig(
            level=getattr(logging, log_level.upper(), logging.ERROR),
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger('MySQLUploader')

    def _create_connection_pool(self):
        """Build a ``PooledDB`` pool from the stored connection settings."""
        return PooledDB(
            creator=pymysql,
            host=self.host,
            port=self.port,
            user=self.username,
            password=self.password,
            charset=self.charset,
            maxconnections=self.pool_size,
            cursorclass=pymysql.cursors.DictCursor
        )

    def _validate_identifier(self, identifier: str) -> str:
        """Return *identifier* with everything but word characters and ``$`` removed.

        Word characters are matched in Unicode mode, so non-ASCII names such
        as ``日期`` are kept intact.

        :param identifier: database, table or column name
        :return: the cleaned identifier
        :raises ValueError: when the input is empty, not a string, empty after
            cleaning, or longer than 64 characters
        """
        if not identifier or not isinstance(identifier, str):
            raise ValueError(f"Invalid identifier: {identifier}")

        cleaned = re.sub(r'[^\w$]', '', identifier)
        if not cleaned:
            raise ValueError(f"Invalid identifier after cleaning: {identifier}")
        if len(cleaned) > 64:
            raise ValueError(f"Identifier longer than 64 characters: {identifier}")

        return cleaned

    def _validate_value(self, value: Any, column_type: str) -> Any:
        """Convert *value* to the Python type that fits *column_type*.

        ``None``, float NaN and ``pandas.NaT`` all become ``None``.

        :param value: raw value
        :param column_type: SQL type of the target column
        :return: the converted value
        :raises ValueError: when the conversion fails
        """
        if value is None or value is pd.NaT:
            return None
        if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
            return None

        kind = column_type.lower()
        try:
            if 'int' in kind:
                return int(value)
            if 'float' in kind or 'double' in kind or 'decimal' in kind:
                return float(value)
            if 'date' in kind or 'time' in kind:
                if isinstance(value, (datetime.datetime, pd.Timestamp)):
                    return value.strftime('%Y-%m-%d %H:%M:%S')
                return str(value)
            if 'char' in kind or 'text' in kind:
                return str(value)
            return value
        except (ValueError, TypeError) as e:
            raise ValueError(f"Failed to convert value {value} to type {column_type}: {str(e)}") from e

    def _execute_with_retry(self, func, *args, **kwargs):
        """Call ``func(*args, **kwargs)``, retrying on ``pymysql.OperationalError``.

        Between attempts the method sleeps ``retry_interval`` seconds and
        rebuilds the connection pool. At least one attempt is always made.

        :return: whatever *func* returns
        :raises pymysql.OperationalError: the last error once attempts run out
        """
        attempts = max(1, int(self.max_retries))
        for attempt in range(1, attempts + 1):
            try:
                return func(*args, **kwargs)
            except pymysql.OperationalError:
                if attempt == attempts:
                    raise
                time.sleep(self.retry_interval)
                # Release the old pool before replacing it so its idle
                # connections are not left open.
                try:
                    self.pool.close()
                except Exception:
                    pass  # the old pool may already be unusable
                self.pool = self._create_connection_pool()

    def _get_connection(self):
        """Return a connection from the pool."""
        return self.pool.connection()

    def _check_database_exists(self, db_name: str) -> bool:
        """Return ``True`` when the database exists."""
        db_name = self._validate_identifier(db_name)
        sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"

        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (db_name,))
                return bool(cursor.fetchone())

    def _create_database(self, db_name: str):
        """Create the database with the configured charset and collation if missing."""
        db_name = self._validate_identifier(db_name)
        # Both values are written into SQL text, so they go through the same cleaning.
        charset = self._validate_identifier(self.charset)
        collation = self._validate_identifier(self.collation)
        sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {charset} COLLATE {collation}"

        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql)
            conn.commit()

    def _check_table_exists(self, db_name: str, table_name: str) -> bool:
        """Return ``True`` when the table exists in the database."""
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)
        sql = """
            SELECT TABLE_NAME
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
        """

        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (db_name, table_name))
                return bool(cursor.fetchone())

    def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
        """Return ``{column name: data type}`` for the table."""
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)
        sql = """
            SELECT COLUMN_NAME, DATA_TYPE
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
        """

        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (db_name, table_name))
                return {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}

    def _create_table(
            self,
            db_name: str,
            table_name: str,
            columns: Dict[str, str],
            primary_keys: Optional[List[str]] = None,
            date_column: Optional[str] = None
    ):
        """Create the table and, when *date_column* is one of its columns, index it.

        :param db_name: database name
        :param table_name: table name
        :param columns: ``{column name: SQL type}``; the types are written into
            the statement as given
        :param primary_keys: columns forming the primary key
        :param date_column: column to index when present in *columns*
        :raises ValueError: when *columns* is empty or a name is invalid
        """
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)

        if not columns:
            raise ValueError("No columns specified for table creation")

        column_defs = [
            f"`{self._validate_identifier(col_name)}` {col_type}"
            for col_name, col_type in columns.items()
        ]

        primary_key_sql = ""
        if primary_keys:
            safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
            primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"

        sql = f"""
            CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
                {','.join(column_defs)}
                {primary_key_sql}
            ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
        """

        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql)

            if date_column and date_column in columns:
                safe_date_col = self._validate_identifier(date_column)
                index_sql = (
                    f"ALTER TABLE `{db_name}`.`{table_name}` "
                    f"ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
                )
                with conn.cursor() as cursor:
                    cursor.execute(index_sql)

            conn.commit()

    def _prepare_data(
            self,
            data: Union[Dict, List[Dict], pd.DataFrame],
            columns: Dict[str, str],
            allow_null: bool = False
    ) -> List[Dict]:
        """Normalise *data* to a list of dicts restricted to *columns*.

        :param data: a dict, a list of dicts or a DataFrame
        :param columns: ``{column name: SQL type}``
        :param allow_null: when false, a row lacking one of *columns* raises;
            when true, the missing column is filled with ``None``
        :return: one converted dict per input row
        :raises ValueError: on unsupported input, a missing column or a failed
            conversion
        """
        if isinstance(data, pd.DataFrame):
            data = data.to_dict('records')
        elif isinstance(data, dict):
            data = [data]

        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            raise ValueError("Data must be a dict, list of dicts, or DataFrame")

        prepared_data = []
        for row in data:
            prepared_row = {}
            for col_name, col_type in columns.items():
                if col_name not in row:
                    if not allow_null:
                        raise ValueError(f"Missing required column '{col_name}' in data")
                    prepared_row[col_name] = None
                    continue
                try:
                    prepared_row[col_name] = self._validate_value(row[col_name], col_type)
                except ValueError as e:
                    raise ValueError(f"Error in column '{col_name}': {str(e)}") from e
            prepared_data.append(prepared_row)

        return prepared_data

    def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
        """Return the per-year or per-month table name for *date_value*.

        :param table_name: base table name
        :param date_value: date string in one of ``_DATE_FORMATS``
        :param partition_by: ``'year'`` or ``'month'``
        :return: ``<table>_<YYYY>`` or ``<table>_<YYYY>_<MM>``
        :raises ValueError: on an unparsable date or unknown *partition_by*
        """
        date_obj = None
        for fmt in self._DATE_FORMATS:
            try:
                date_obj = datetime.datetime.strptime(str(date_value).strip(), fmt)
                break
            except ValueError:
                continue
        if date_obj is None:
            raise ValueError(f"Invalid date format: {date_value}")

        if partition_by == 'year':
            return f"{table_name}_{date_obj.year}"
        if partition_by == 'month':
            return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
        raise ValueError("partition_by must be 'year' or 'month'")

    def _insert_data(
            self,
            db_name: str,
            table_name: str,
            data: List[Dict],
            columns: Dict[str, str],
            check_duplicate: bool = False,
            duplicate_columns: Optional[List[str]] = None,
            batch_size: int = 1000
    ):
        """Insert *data* in batches, committing after each batch.

        With *check_duplicate* the statement gains ``ON DUPLICATE KEY UPDATE``
        for every column, so a row that collides with a PRIMARY or UNIQUE key
        overwrites the existing one. Which rows collide is decided by the
        table's keys; *duplicate_columns* is only validated, it does not
        change the statement.

        :param db_name: database name
        :param table_name: table name
        :param data: rows to insert
        :param columns: ``{column name: SQL type}``; fixes the column order
        :param check_duplicate: add the ``ON DUPLICATE KEY UPDATE`` clause
        :param duplicate_columns: column names checked for validity only
        :param batch_size: rows per ``executemany`` call
        """
        db_name = self._validate_identifier(db_name)
        table_name = self._validate_identifier(table_name)

        if not data:
            return

        all_columns = list(columns.keys())
        safe_columns = [self._validate_identifier(col) for col in all_columns]
        placeholders = ','.join(['%s'] * len(safe_columns))
        column_list = '`,`'.join(safe_columns)

        sql = f"INSERT INTO `{db_name}`.`{table_name}` (`{column_list}`) VALUES ({placeholders})"
        if check_duplicate:
            for col in duplicate_columns or []:
                self._validate_identifier(col)  # reject malformed names early
            update_clause = ','.join([f"`{col}`=VALUES(`{col}`)" for col in safe_columns])
            sql += f" ON DUPLICATE KEY UPDATE {update_clause}"

        with self._get_connection() as conn:
            with conn.cursor() as cursor:
                for start in range(0, len(data), batch_size):
                    values = [
                        [row.get(col) for col in all_columns]
                        for row in data[start:start + batch_size]
                    ]
                    try:
                        cursor.executemany(sql, values)
                        conn.commit()
                    except Exception:
                        conn.rollback()
                        raise

    def upload_data(
            self,
            db_name: str,
            table_name: str,
            data: Union[Dict, List[Dict], pd.DataFrame],
            columns: Dict[str, str],
            primary_keys: Optional[List[str]] = None,
            check_duplicate: bool = False,
            duplicate_columns: Optional[List[str]] = None,
            allow_null: bool = False,
            partition_by: Optional[str] = None,
            partition_date_column: str = '日期',
            auto_create: bool = True
    ):
        """Upload *data* into a table, creating the database and table if allowed.

        :param db_name: database name
        :param table_name: table name (base name when partitioning)
        :param data: a dict, a list of dicts or a DataFrame
        :param columns: ``{column name: SQL type}``
        :param primary_keys: primary-key columns used when a table is created
        :param check_duplicate: forwarded to ``_insert_data``
        :param duplicate_columns: forwarded to ``_insert_data``
        :param allow_null: forwarded to ``_prepare_data``
        :param partition_by: ``'year'``, ``'month'`` or ``None`` for one table
        :param partition_date_column: column whose date selects the partition
            table; also passed on as the column to index
        :param auto_create: create a missing database or table instead of raising
        :raises ValueError: on invalid arguments, invalid data, or a missing
            database/table when *auto_create* is false
        """
        if not columns:
            raise ValueError("Columns specification is required")

        if partition_by and partition_by not in ['year', 'month']:
            raise ValueError("partition_by must be 'year', 'month' or None")

        prepared_data = self._prepare_data(data, columns, allow_null)

        if not self._check_database_exists(db_name):
            if auto_create:
                self._create_database(db_name)
            else:
                raise ValueError(f"Database '{db_name}' does not exist")

        if partition_by:
            # Group the rows by the table their date maps to.
            partitioned_data = {}
            for row in prepared_data:
                if partition_date_column not in row:
                    raise ValueError(f"Partition date column '{partition_date_column}' not found in data")
                part_table = self._get_partition_table_name(
                    table_name, str(row[partition_date_column]), partition_by
                )
                partitioned_data.setdefault(part_table, []).append(row)
            targets = partitioned_data.items()
        else:
            targets = [(table_name, prepared_data)]

        for target_table, rows in targets:
            self._upload_to_table(
                db_name, target_table, rows, columns,
                primary_keys, check_duplicate, duplicate_columns,
                allow_null, auto_create, partition_date_column
            )

    def _upload_to_table(
            self,
            db_name: str,
            table_name: str,
            data: List[Dict],
            columns: Dict[str, str],
            primary_keys: Optional[List[str]],
            check_duplicate: bool,
            duplicate_columns: Optional[List[str]],
            allow_null: bool,
            auto_create: bool,
            date_column: Optional[str]
    ):
        """Make sure the table exists and matches *columns*, then insert *data*.

        :raises ValueError: when the table is missing and *auto_create* is
            false, its columns cannot be read, or a column is not in the table
        """
        if not self._check_table_exists(db_name, table_name):
            if auto_create:
                self._create_table(db_name, table_name, columns, primary_keys, date_column)
            else:
                raise ValueError(f"Table '{db_name}.{table_name}' does not exist")

        table_columns = self._get_table_columns(db_name, table_name)
        if not table_columns:
            raise ValueError(f"Failed to get columns for table '{db_name}.{table_name}'")

        # Tables are created and written with cleaned column names, so the
        # check uses the cleaned form too; MySQL column names are
        # case-insensitive, hence the lower-casing.
        existing = {name.lower() for name in table_columns}
        for col in columns:
            if self._validate_identifier(col).lower() not in existing:
                raise ValueError(f"Column '{col}' not found in table '{db_name}.{table_name}'")

        self._insert_data(
            db_name, table_name, data, columns,
            check_duplicate, duplicate_columns
        )

    def close(self):
        """Close the connection pool."""
        self.pool.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
561
+
562
+
46
563
  class MysqlUpload:
47
564
  def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
48
565
  self.username = username
mdbq/spider/aikucun.py CHANGED
@@ -257,13 +257,13 @@ class AikuCun:
257
257
  )
258
258
  print(f'正在获取数据({num}/{len(date_list)}): {item_type}榜单 {date}')
259
259
  # print(res.json())
260
- if not res.json()['success']:
260
+ if not res.json().get('success', None):
261
261
  print('没有获取到数据, 请求不成功, 如果连续请求失败 > 5, 则需重新获取cookie后继续')
262
262
  num += 1
263
263
  self.error_count += 1
264
264
  time.sleep(1)
265
265
  continue
266
- if not res.json()['data']['rows']:
266
+ if not res.json().get('data', {}).get('rows', None):
267
267
  print("返回的数据字典异常, ['data']['rows'] 不能为空")
268
268
  num += 1
269
269
  self.error_count += 1
@@ -479,7 +479,10 @@ def main(start_date, end_date=None, item_type=['spu']):
479
479
 
480
480
  if __name__ == '__main__':
481
481
  main(
482
- start_date='2025-03-25',
483
- # end_date='2025-03-26', # 不传则默认到今天
484
- item_type=['spu', 'sku']
482
+ start_date='2025-05-13',
483
+ # end_date='2025-04-28', # 不传则默认到今天
484
+ item_type=[
485
+ # 'spu',
486
+ 'sku'
487
+ ]
485
488
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.8.18
3
+ Version: 3.9.0
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,13 +1,14 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=kflFjk3ujk2iECHCD3Bw3eVeZ9O3eAB76MG428NMJM0,18
2
+ mdbq/__version__.py,sha256=7hLUrBXQAGj08UbPoot9b_BwLXRkc-RH_nJSvG9AqTc,17
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
+ mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
4
5
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
5
6
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
6
7
  mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
7
8
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
8
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
9
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
10
- mdbq/mysql/mysql.py,sha256=umcLpw5cYGNNJnEjBLh_bgBXeh5LntPKFm8VslQ01ow,55030
11
+ mdbq/mysql/mysql.py,sha256=L_UR7TqcZoHZj6dWVZe-ai6X2yc_oULPyUzKy7DHbOw,74493
11
12
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
12
13
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
13
14
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
@@ -20,8 +21,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
20
21
  mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
21
22
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
22
23
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
23
- mdbq/spider/aikucun.py,sha256=QfyUtXMuPZ5mJVNDUlFa_ltFXiCCTccBz6MT3YT-7HI,19742
24
- mdbq-3.8.18.dist-info/METADATA,sha256=tQFP7EyTad4nohSxELsIYAuvw8R4eQaVCxu8NDh9OD4,364
25
- mdbq-3.8.18.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
26
- mdbq-3.8.18.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
27
- mdbq-3.8.18.dist-info/RECORD,,
24
+ mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
25
+ mdbq-3.9.0.dist-info/METADATA,sha256=pd--meyNjH8KaX-ZgnSHqWN9GbEyx59Atb1Wgs6BqVc,363
26
+ mdbq-3.9.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
27
+ mdbq-3.9.0.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
28
+ mdbq-3.9.0.dist-info/RECORD,,
File without changes