mdbq 3.10.7__py3-none-any.whl → 3.10.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/optimize.py +1 -0
- mdbq/aggregation/query_data.py +2 -0
- mdbq/log/mylogger.py +8 -19
- mdbq/mysql/deduplicator.py +30 -22
- mdbq/mysql/mysql.py +336 -280
- mdbq/mysql/s_query.py +159 -143
- mdbq/mysql/uploader.py +125 -52
- mdbq/redis/getredis.py +0 -2
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/METADATA +1 -1
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/RECORD +13 -13
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/WHEEL +0 -0
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/top_level.txt +0 -0
mdbq/mysql/mysql.py
CHANGED
@@ -55,41 +55,61 @@ def count_decimal_places(num_str):
|
|
55
55
|
|
56
56
|
|
57
57
|
class MysqlUpload:
|
58
|
+
"""
|
59
|
+
MySQL 数据上传与表结构自动维护工具类。
|
60
|
+
支持字典/数据框批量插入、自动建表、自动补全字段、类型推断、增量更新等。
|
61
|
+
"""
|
58
62
|
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
+
"""
|
64
|
+
初始化 MysqlUpload 实例。
|
65
|
+
:param username: 数据库用户名
|
66
|
+
:param password: 数据库密码
|
67
|
+
:param host: 数据库主机
|
68
|
+
:param port: 数据库端口
|
69
|
+
:param charset: 字符集,默认 utf8mb4
|
70
|
+
"""
|
71
|
+
self.username: str = username
|
72
|
+
self.password: str = password
|
73
|
+
self.host: str = host
|
74
|
+
self.port: int = port
|
63
75
|
if username == '' or password == '' or host == '' or port == 0:
|
64
|
-
self.config = None
|
76
|
+
self.config: dict | None = None
|
65
77
|
else:
|
66
|
-
self.config = {
|
78
|
+
self.config: dict = {
|
67
79
|
'host': self.host,
|
68
80
|
'port': int(self.port),
|
69
81
|
'user': self.username,
|
70
82
|
'password': self.password,
|
71
|
-
'charset': charset,
|
83
|
+
'charset': charset,
|
72
84
|
'cursorclass': pymysql.cursors.DictCursor,
|
73
85
|
}
|
74
|
-
self.filename = None
|
86
|
+
self.filename: str | None = None
|
75
87
|
|
76
88
|
@staticmethod
|
77
|
-
def try_except(func):
|
78
|
-
|
89
|
+
def try_except(func):
|
90
|
+
"""
|
91
|
+
装饰器:捕获并记录函数异常。
|
92
|
+
"""
|
79
93
|
@wraps(func)
|
80
94
|
def wrapper(*args, **kwargs):
|
81
95
|
try:
|
82
96
|
return func(*args, **kwargs)
|
83
97
|
except Exception as e:
|
84
|
-
logger.error(f'{func.__name__}, {e}')
|
85
|
-
|
98
|
+
logger.error(f'{func.__name__}, {e}')
|
86
99
|
return wrapper
|
87
100
|
|
88
|
-
def keep_connect(self, _db_name, _config, max_try: int=10):
|
101
|
+
def keep_connect(self, _db_name: str, _config: dict, max_try: int = 10) -> pymysql.connections.Connection | None:
|
102
|
+
"""
|
103
|
+
保持数据库连接,失败自动重试。
|
104
|
+
:param _db_name: 数据库名
|
105
|
+
:param _config: 连接配置
|
106
|
+
:param max_try: 最大重试次数
|
107
|
+
:return: 数据库连接对象或 None
|
108
|
+
"""
|
89
109
|
attempts = 1
|
90
110
|
while attempts <= max_try:
|
91
111
|
try:
|
92
|
-
connection = pymysql.connect(**_config)
|
112
|
+
connection = pymysql.connect(**_config)
|
93
113
|
return connection
|
94
114
|
except Exception as e:
|
95
115
|
logger.error(f'{_db_name}: 连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
|
@@ -98,13 +118,17 @@ class MysqlUpload:
|
|
98
118
|
logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
|
99
119
|
return None
|
100
120
|
|
101
|
-
def cover_doc_dtypes(self, dict_data):
|
102
|
-
"""
|
121
|
+
def cover_doc_dtypes(self, dict_data: dict) -> tuple[dict, dict] | None:
|
122
|
+
"""
|
123
|
+
清理字典键值并推断数据类型。
|
124
|
+
:param dict_data: 原始数据字典
|
125
|
+
:return: (字段类型字典, 清理后的数据字典)
|
126
|
+
"""
|
103
127
|
if not dict_data:
|
104
128
|
logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
|
105
|
-
return
|
106
|
-
__res_dict = {}
|
107
|
-
new_dict_data = {}
|
129
|
+
return None
|
130
|
+
__res_dict: dict = {}
|
131
|
+
new_dict_data: dict = {}
|
108
132
|
for k, v in dict_data.items():
|
109
133
|
k = str(k).lower()
|
110
134
|
k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
|
@@ -115,21 +139,20 @@ class MysqlUpload:
|
|
115
139
|
result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
|
116
140
|
result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
|
117
141
|
result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
if result1: # 京东sku/spu商品信息
|
142
|
+
date_type = otk.is_valid_date(v)
|
143
|
+
int_num = otk.is_integer(v)
|
144
|
+
count_int, count_float = count_decimal_places(v)
|
145
|
+
if result1:
|
123
146
|
__res_dict.update({k: 'varchar(100)'})
|
124
147
|
elif k == '日期':
|
125
148
|
__res_dict.update({k: 'DATE'})
|
126
149
|
elif k == '更新时间':
|
127
150
|
__res_dict.update({k: 'TIMESTAMP'})
|
128
|
-
elif result2:
|
151
|
+
elif result2:
|
129
152
|
__res_dict.update({k: 'decimal(10,4)'})
|
130
|
-
elif date_type == 1:
|
153
|
+
elif date_type == 1:
|
131
154
|
__res_dict.update({k: 'DATE'})
|
132
|
-
elif date_type == 2:
|
155
|
+
elif date_type == 2:
|
133
156
|
__res_dict.update({k: 'DATETIME'})
|
134
157
|
elif int_num:
|
135
158
|
__res_dict.update({k: 'INT'})
|
@@ -152,14 +175,17 @@ class MysqlUpload:
|
|
152
175
|
return __res_dict, new_dict_data
|
153
176
|
|
154
177
|
@try_except
|
155
|
-
def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
178
|
+
def insert_many_dict(self, db_name: str, table_name: str, dict_data_list: list[dict], icm_update: list[str] = None, index_length: int = 100, set_typ: dict = None, allow_not_null: bool = False, cut_data: str = None) -> None:
|
156
179
|
"""
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
180
|
+
批量插入字典数据,自动建表、补全字段、类型推断。
|
181
|
+
:param db_name: 数据库名
|
182
|
+
:param table_name: 表名
|
183
|
+
:param dict_data_list: 字典数据列表
|
184
|
+
:param icm_update: 增量更新主键列名列表
|
185
|
+
:param index_length: 索引长度
|
186
|
+
:param set_typ: 自定义字段类型
|
187
|
+
:param allow_not_null: 是否允许字段为NULL
|
188
|
+
:param cut_data: 按年或月分表
|
163
189
|
"""
|
164
190
|
if not self.config:
|
165
191
|
return
|
@@ -321,14 +347,17 @@ class MysqlUpload:
|
|
321
347
|
connection.close()
|
322
348
|
|
323
349
|
# @try_except
|
324
|
-
def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
350
|
+
def dict_to_mysql(self, db_name: str, table_name: str, dict_data: dict, icm_update: list[str] = None, index_length: int = 100, set_typ: dict = None, allow_not_null: bool = False, cut_data: str = None) -> None:
|
325
351
|
"""
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
352
|
+
插入单条字典数据,自动建表、补全字段、类型推断。
|
353
|
+
:param db_name: 数据库名
|
354
|
+
:param table_name: 表名
|
355
|
+
:param dict_data: 单条字典数据
|
356
|
+
:param icm_update: 增量更新主键列名列表
|
357
|
+
:param index_length: 索引长度
|
358
|
+
:param set_typ: 自定义字段类型
|
359
|
+
:param allow_not_null: 是否允许字段为NULL
|
360
|
+
:param cut_data: 按年或月分表
|
332
361
|
"""
|
333
362
|
if not self.config:
|
334
363
|
return
|
@@ -394,6 +423,7 @@ class MysqlUpload:
|
|
394
423
|
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
|
395
424
|
else:
|
396
425
|
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
426
|
+
|
397
427
|
cursor.execute(sql)
|
398
428
|
logger.info(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
399
429
|
|
@@ -402,80 +432,93 @@ class MysqlUpload:
|
|
402
432
|
logger.info(f"设置为索引: {col}({dtypes[col]})")
|
403
433
|
cursor.execute(sql)
|
404
434
|
connection.commit() # 提交事务
|
405
|
-
|
406
|
-
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
435
|
+
|
407
436
|
# 处理插入的数据
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
condition_parts = []
|
420
|
-
for up_col in icm_update:
|
421
|
-
condition_parts.append(f"`{up_col}` = %s") # SQL 转义
|
422
|
-
condition_params.append(dict_data[up_col]) # 原始列名访问数据
|
423
|
-
|
424
|
-
# 动态转义列名生成 SQL 查询字段
|
425
|
-
escaped_update_col = [f'`{col}`' for col in raw_update_col]
|
426
|
-
sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
|
427
|
-
cursor.execute(sql, condition_params)
|
428
|
-
results = cursor.fetchall()
|
437
|
+
datas = [dict_data]
|
438
|
+
for dict_data in datas:
|
439
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
440
|
+
if icm_update:
|
441
|
+
""" 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
|
442
|
+
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
443
|
+
cursor.execute(sql, (db_name, table_name))
|
444
|
+
columns = cursor.fetchall()
|
445
|
+
cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
|
446
|
+
# 保留原始列名,不提前转义
|
447
|
+
raw_update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
|
429
448
|
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
cursor.execute(sql, tuple(dict_data.values()))
|
464
|
-
connection.commit() # 提交数据库
|
465
|
-
connection.close()
|
466
|
-
return
|
449
|
+
# 构建条件参数(使用原始列名)
|
450
|
+
condition_params = []
|
451
|
+
condition_parts = []
|
452
|
+
for up_col in icm_update:
|
453
|
+
condition_parts.append(f"`{up_col}` = %s") # SQL 转义
|
454
|
+
condition_params.append(dict_data[up_col]) # 原始列名用于访问数据
|
455
|
+
|
456
|
+
# 动态转义列名生成 SQL 查询字段
|
457
|
+
escaped_update_col = [f'`{col}`' for col in raw_update_col]
|
458
|
+
sql = f"""SELECT {','.join(escaped_update_col)} FROM `{table_name}` WHERE {' AND '.join(condition_parts)}"""
|
459
|
+
cursor.execute(sql, condition_params)
|
460
|
+
results = cursor.fetchall()
|
461
|
+
|
462
|
+
if results:
|
463
|
+
for result in results:
|
464
|
+
change_col = []
|
465
|
+
change_placeholders = []
|
466
|
+
set_params = []
|
467
|
+
for raw_col in raw_update_col:
|
468
|
+
# 使用原始列名访问数据
|
469
|
+
df_value = str(dict_data[raw_col])
|
470
|
+
mysql_value = str(result[raw_col])
|
471
|
+
|
472
|
+
# 清理小数点后多余的零
|
473
|
+
if '.' in df_value:
|
474
|
+
df_value = re.sub(r'0+$', '', df_value).rstrip('.')
|
475
|
+
if '.' in mysql_value:
|
476
|
+
mysql_value = re.sub(r'0+$', '', mysql_value).rstrip('.')
|
477
|
+
|
478
|
+
if df_value != mysql_value:
|
479
|
+
change_placeholders.append(f"`{raw_col}` = %s") # 动态转义列名
|
480
|
+
set_params.append(dict_data[raw_col])
|
481
|
+
change_col.append(raw_col)
|
467
482
|
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
483
|
+
if change_placeholders:
|
484
|
+
full_params = set_params + condition_params
|
485
|
+
sql = f"""UPDATE `{table_name}`
|
486
|
+
SET {','.join(change_placeholders)}
|
487
|
+
WHERE {' AND '.join(condition_parts)}"""
|
488
|
+
cursor.execute(sql, full_params)
|
489
|
+
else: # 没有数据返回,则直接插入数据
|
490
|
+
# 参数化插入
|
491
|
+
cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
|
492
|
+
placeholders = ', '.join(['%s'] * len(dict_data))
|
493
|
+
sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})"
|
494
|
+
cursor.execute(sql, tuple(dict_data.values()))
|
495
|
+
connection.commit() # 提交数据库
|
496
|
+
continue
|
497
|
+
|
498
|
+
# 标准插入逻辑(参数化修改)
|
499
|
+
# 构造更新列(排除主键)
|
500
|
+
update_cols = [k for k in dict_data.keys()]
|
501
|
+
# 构建SQL
|
502
|
+
cols = ', '.join([f'`{k}`' for k in dict_data.keys()])
|
503
|
+
placeholders = ', '.join(['%s'] * len(dict_data))
|
504
|
+
update_clause = ', '.join([f'`{k}` = VALUES(`{k}`)' for k in update_cols]) or 'id=id'
|
505
|
+
|
506
|
+
sql = f"""INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders}) ON DUPLICATE KEY UPDATE {update_clause}"""
|
507
|
+
# 执行参数化查询
|
508
|
+
try:
|
509
|
+
cursor.execute(sql, tuple(dict_data.values()))
|
510
|
+
connection.commit()
|
511
|
+
except pymysql.Error as e:
|
512
|
+
logger.error(f"插入失败: {e}\nSQL: {cursor.mogrify(sql, tuple(dict_data.values()))}")
|
513
|
+
connection.rollback()
|
475
514
|
connection.close()
|
476
515
|
|
477
|
-
def cover_dict_dtypes(self, dict_data):
|
478
|
-
"""
|
516
|
+
def cover_dict_dtypes(self, dict_data: dict) -> tuple[dict, dict] | None:
|
517
|
+
"""
|
518
|
+
清理字典键值并推断数据类型。
|
519
|
+
:param dict_data: 原始数据字典
|
520
|
+
:return: (字段类型字典, 清理后的数据字典)
|
521
|
+
"""
|
479
522
|
if not dict_data:
|
480
523
|
logger.info(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
|
481
524
|
return
|
@@ -536,8 +579,12 @@ class MysqlUpload:
|
|
536
579
|
new_dict_data.update({k: v})
|
537
580
|
return __res_dict, new_dict_data
|
538
581
|
|
539
|
-
def convert_df_dtypes(self, df: pd.DataFrame):
|
540
|
-
"""
|
582
|
+
def convert_df_dtypes(self, df: pd.DataFrame) -> tuple[dict, pd.DataFrame]:
|
583
|
+
"""
|
584
|
+
清理 DataFrame 的值和列名,并推断数据类型。
|
585
|
+
:param df: 原始 DataFrame
|
586
|
+
:return: (字段类型字典, 清理后的 DataFrame)
|
587
|
+
"""
|
541
588
|
df = otk.cover_df(df=df) # 清理 df 的值和列名
|
542
589
|
[pd.to_numeric(df[col], errors='ignore') for col in df.columns.tolist()]
|
543
590
|
dtypes = df.dtypes.to_dict()
|
@@ -573,18 +620,20 @@ class MysqlUpload:
|
|
573
620
|
return __res_dict, df
|
574
621
|
|
575
622
|
@try_except
|
576
|
-
def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False,
|
577
|
-
filename=None, count=None, allow_not_null=False, cut_data=None):
|
623
|
+
def df_to_mysql(self, df: pd.DataFrame, db_name: str, table_name: str, set_typ: dict = None, icm_update: list[str] = [], move_insert: bool = False, df_sql: bool = False, filename: str = None, count: int = None, allow_not_null: bool = False, cut_data: str = None) -> None:
|
578
624
|
"""
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
icm_update:
|
585
|
-
|
586
|
-
|
587
|
-
|
625
|
+
DataFrame 批量上传到 MySQL,自动建表、补全字段、类型推断。
|
626
|
+
:param df: DataFrame
|
627
|
+
:param db_name: 数据库名
|
628
|
+
:param table_name: 表名
|
629
|
+
:param set_typ: 自定义字段类型
|
630
|
+
:param icm_update: 增量更新主键列名列表
|
631
|
+
:param move_insert: 是否先删除再插入
|
632
|
+
:param df_sql: 是否用 to_sql 批量插入
|
633
|
+
:param filename: 进度追踪文件名
|
634
|
+
:param count: 进度计数
|
635
|
+
:param allow_not_null: 是否允许字段为NULL
|
636
|
+
:param cut_data: 按年或月分表
|
588
637
|
"""
|
589
638
|
if not self.config:
|
590
639
|
return
|
@@ -659,7 +708,7 @@ class MysqlUpload:
|
|
659
708
|
cursor.execute(create_table_sql)
|
660
709
|
logger.info(f'创建 mysql 表: {table_name}')
|
661
710
|
|
662
|
-
#
|
711
|
+
# 有特殊字符不需转义
|
663
712
|
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
664
713
|
cursor.execute(sql, (db_name, table_name))
|
665
714
|
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
|
@@ -809,52 +858,67 @@ class MysqlUpload:
|
|
809
858
|
|
810
859
|
class OptimizeDatas:
|
811
860
|
"""
|
812
|
-
|
813
|
-
|
861
|
+
数据维护类:用于删除 MySQL 的冗余数据。
|
862
|
+
更新过程:
|
814
863
|
1. 读取所有数据表
|
815
|
-
2.
|
816
|
-
3.
|
817
|
-
tips:
|
864
|
+
2. 遍历表,遍历列,如果存在日期列则按天遍历所有日期,不存在则全表读取
|
865
|
+
3. 按天删除所有冗余数据(存在日期列时)
|
866
|
+
tips: 查找冗余数据的方式是创建一个临时迭代器,逐行读取数据并添加到迭代器,出现重复时将重复数据的 id 添加到临时列表,按列表 id 执行删除
|
818
867
|
"""
|
819
868
|
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
869
|
+
"""
|
870
|
+
初始化 OptimizeDatas 实例。
|
871
|
+
:param username: 数据库用户名
|
872
|
+
:param password: 数据库密码
|
873
|
+
:param host: 数据库主机
|
874
|
+
:param port: 数据库端口
|
875
|
+
:param charset: 字符集,默认 utf8mb4
|
876
|
+
"""
|
877
|
+
self.username: str = username
|
878
|
+
self.password: str = password
|
879
|
+
self.host: str = host
|
880
|
+
self.port: int = port
|
881
|
+
self.charset: str = charset
|
882
|
+
self.config: dict = {
|
826
883
|
'host': self.host,
|
827
884
|
'port': int(self.port),
|
828
885
|
'user': self.username,
|
829
886
|
'password': self.password,
|
830
|
-
'charset': self.charset,
|
887
|
+
'charset': self.charset,
|
831
888
|
'cursorclass': pymysql.cursors.DictCursor,
|
832
889
|
}
|
833
|
-
self.db_name_lists: list = [] #
|
834
|
-
self.db_name = None
|
890
|
+
self.db_name_lists: list[str] = [] # 需要优化的数据库名列表
|
891
|
+
self.db_name: str | None = None
|
835
892
|
self.days: int = 63 # 对近 N 天的数据进行排重
|
836
|
-
self.end_date = None
|
837
|
-
self.start_date = None
|
838
|
-
self.connection = None
|
893
|
+
self.end_date: pd.Timestamp | None = None
|
894
|
+
self.start_date: pd.Timestamp | None = None
|
895
|
+
self.connection: pymysql.connections.Connection | None = None
|
839
896
|
|
840
897
|
@staticmethod
|
841
|
-
def try_except(func):
|
842
|
-
|
898
|
+
def try_except(func):
|
899
|
+
"""
|
900
|
+
装饰器:捕获并记录函数异常。
|
901
|
+
"""
|
843
902
|
@wraps(func)
|
844
903
|
def wrapper(*args, **kwargs):
|
845
904
|
try:
|
846
905
|
return func(*args, **kwargs)
|
847
906
|
except Exception as e:
|
848
|
-
logger.error(f'{func.__name__}, {e}')
|
849
|
-
|
907
|
+
logger.error(f'{func.__name__}, {e}')
|
850
908
|
return wrapper
|
851
909
|
|
852
|
-
def keep_connect(self, _db_name, _config, max_try: int=10):
|
910
|
+
def keep_connect(self, _db_name: str, _config: dict, max_try: int = 10) -> pymysql.connections.Connection | None:
|
911
|
+
"""
|
912
|
+
保持数据库连接,失败自动重试。
|
913
|
+
:param _db_name: 数据库名
|
914
|
+
:param _config: 连接配置
|
915
|
+
:param max_try: 最大重试次数
|
916
|
+
:return: 数据库连接对象或 None
|
917
|
+
"""
|
853
918
|
attempts = 1
|
854
919
|
while attempts <= max_try:
|
855
920
|
try:
|
856
|
-
|
857
|
-
return connection
|
921
|
+
return pymysql.connect(**_config)
|
858
922
|
except Exception as e:
|
859
923
|
logger.error(f'{_db_name}连接失败,正在重试: {self.host}:{self.port} {attempts}/{max_try} {e}')
|
860
924
|
attempts += 1
|
@@ -862,10 +926,10 @@ class OptimizeDatas:
|
|
862
926
|
logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
|
863
927
|
return None
|
864
928
|
|
865
|
-
def optimize_list(self):
|
929
|
+
def optimize_list(self) -> None:
|
866
930
|
"""
|
867
|
-
|
868
|
-
需要设置 self.db_name_lists
|
931
|
+
批量优化多个数据库,移除冗余数据。
|
932
|
+
需要设置 self.db_name_lists。
|
869
933
|
"""
|
870
934
|
if not self.db_name_lists:
|
871
935
|
logger.info(f'尚未设置参数: self.db_name_lists')
|
@@ -874,8 +938,11 @@ class OptimizeDatas:
|
|
874
938
|
self.db_name = db_name
|
875
939
|
self.optimize()
|
876
940
|
|
877
|
-
def optimize(self, except_key=['更新时间']):
|
878
|
-
"""
|
941
|
+
def optimize(self, except_key: list[str] = ['更新时间']) -> None:
|
942
|
+
"""
|
943
|
+
优化当前数据库,移除冗余数据。
|
944
|
+
:param except_key: 排除的字段名列表,默认['更新时间']
|
945
|
+
"""
|
879
946
|
if not self.db_name:
|
880
947
|
logger.info(f'尚未设置参数: self.db_name')
|
881
948
|
return
|
@@ -883,7 +950,6 @@ class OptimizeDatas:
|
|
883
950
|
if not tables:
|
884
951
|
logger.info(f'{self.db_name} -> 数据表不存在')
|
885
952
|
return
|
886
|
-
|
887
953
|
# 日期初始化
|
888
954
|
if not self.end_date:
|
889
955
|
self.end_date = pd.to_datetime(datetime.datetime.today())
|
@@ -897,212 +963,202 @@ class OptimizeDatas:
|
|
897
963
|
self.start_date = pd.to_datetime(self.start_date)
|
898
964
|
start_date_before = self.start_date
|
899
965
|
end_date_before = self.end_date
|
900
|
-
|
901
966
|
logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
|
902
967
|
for table_dict in tables:
|
903
|
-
for
|
904
|
-
self.config.update({'database': self.db_name})
|
905
|
-
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=
|
968
|
+
for _, table_name in table_dict.items():
|
969
|
+
self.config.update({'database': self.db_name})
|
970
|
+
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
|
906
971
|
if not self.connection:
|
907
|
-
|
972
|
+
continue
|
908
973
|
with self.connection.cursor() as cursor:
|
909
|
-
|
910
|
-
cursor.
|
911
|
-
result = cursor.fetchone()
|
912
|
-
if not result:
|
974
|
+
cursor.execute(f"SELECT 1 FROM `{table_name}` LIMIT 1")
|
975
|
+
if not cursor.fetchone():
|
913
976
|
logger.info(f'数据表: {table_name}, 数据长度为 0')
|
914
|
-
continue
|
915
|
-
|
916
|
-
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
|
977
|
+
continue
|
978
|
+
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")
|
917
979
|
columns = cursor.fetchall()
|
918
|
-
date_exist =
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
cursor.execute(sql_min)
|
929
|
-
min_result = cursor.fetchone()
|
930
|
-
# 匹配修改为合适的起始和结束日期
|
931
|
-
if self.start_date < pd.to_datetime(min_result['min_date']):
|
932
|
-
self.start_date = pd.to_datetime(min_result['min_date'])
|
933
|
-
if self.end_date > pd.to_datetime(max_result['max_date']):
|
934
|
-
self.end_date = pd.to_datetime(max_result['max_date'])
|
980
|
+
date_exist = any(col['Field'] == '日期' and (col['Type'] == 'date' or col['Type'].startswith('datetime')) for col in columns)
|
981
|
+
if date_exist:
|
982
|
+
cursor.execute(f"SELECT MAX(日期) AS max_date, MIN(日期) AS min_date FROM `{table_name}`")
|
983
|
+
result = cursor.fetchone()
|
984
|
+
min_date = result['min_date']
|
985
|
+
max_date = result['max_date']
|
986
|
+
if min_date and self.start_date < pd.to_datetime(min_date):
|
987
|
+
self.start_date = pd.to_datetime(min_date)
|
988
|
+
if max_date and self.end_date > pd.to_datetime(max_date):
|
989
|
+
self.end_date = pd.to_datetime(max_date)
|
935
990
|
dates_list = self.day_list(start_date=self.start_date, end_date=self.end_date)
|
936
|
-
# dates_list 是日期列表
|
937
991
|
for date in dates_list:
|
938
992
|
self.delete_duplicate(table_name=table_name, date=date, except_key=except_key)
|
939
|
-
self.start_date = start_date_before
|
993
|
+
self.start_date = start_date_before
|
940
994
|
self.end_date = end_date_before
|
941
|
-
else:
|
995
|
+
else:
|
942
996
|
self.delete_duplicate2(table_name=table_name, except_key=except_key)
|
943
997
|
self.connection.close()
|
944
998
|
logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
|
945
999
|
|
946
|
-
def delete_duplicate(self, table_name, date, except_key=['更新时间']):
|
1000
|
+
def delete_duplicate(self, table_name: str, date: pd.Timestamp, except_key: list[str] = ['更新时间']) -> None:
|
1001
|
+
"""
|
1002
|
+
删除指定表指定日期的冗余数据。
|
1003
|
+
:param table_name: 表名
|
1004
|
+
:param date: 日期
|
1005
|
+
:param except_key: 排除的字段名列表
|
1006
|
+
"""
|
947
1007
|
datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
|
948
1008
|
if not datas:
|
949
1009
|
return
|
950
|
-
duplicate_id =
|
951
|
-
all_datas =
|
1010
|
+
duplicate_id: set = set()
|
1011
|
+
all_datas: set = set()
|
952
1012
|
for data in datas:
|
953
|
-
for
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
delete_id
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
duplicate_id.append(delete_id) # 添加 id 到 duplicate_id
|
963
|
-
continue
|
964
|
-
all_datas.append(data) # 数据没有重复
|
965
|
-
except Exception as e:
|
966
|
-
logger.debug(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> {e}')
|
967
|
-
del all_datas
|
968
|
-
|
969
|
-
if not duplicate_id: # 如果没有重复数据,则跳过该数据表
|
1013
|
+
data_no_id = {k: v for k, v in data.items() if k not in except_key and k != 'id'}
|
1014
|
+
data_tuple = tuple(sorted(data_no_id.items()))
|
1015
|
+
delete_id = data.get('id')
|
1016
|
+
if data_tuple in all_datas:
|
1017
|
+
if delete_id:
|
1018
|
+
duplicate_id.add(delete_id)
|
1019
|
+
else:
|
1020
|
+
all_datas.add(data_tuple)
|
1021
|
+
if not duplicate_id:
|
970
1022
|
return
|
971
|
-
|
972
1023
|
try:
|
973
1024
|
with self.connection.cursor() as cursor:
|
974
1025
|
placeholders = ', '.join(['%s'] * len(duplicate_id))
|
975
|
-
# 移除冗余数据
|
976
1026
|
sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
|
977
|
-
cursor.execute(sql, duplicate_id)
|
1027
|
+
cursor.execute(sql, list(duplicate_id))
|
978
1028
|
logger.debug(f"{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
|
979
|
-
self.connection.commit()
|
1029
|
+
self.connection.commit()
|
980
1030
|
except Exception as e:
|
981
1031
|
logger.error(f'{self.db_name}/{table_name}, {e}')
|
982
|
-
self.connection.rollback()
|
1032
|
+
self.connection.rollback()
|
983
1033
|
|
984
|
-
def delete_duplicate2(self, table_name, except_key=['更新时间']):
|
1034
|
+
def delete_duplicate2(self, table_name: str, except_key: list[str] = ['更新时间']) -> None:
|
1035
|
+
"""
|
1036
|
+
删除指定表(无日期列)的冗余数据。
|
1037
|
+
:param table_name: 表名
|
1038
|
+
:param except_key: 排除的字段名列表
|
1039
|
+
"""
|
985
1040
|
with self.connection.cursor() as cursor:
|
986
|
-
|
987
|
-
cursor.execute(sql)
|
1041
|
+
cursor.execute(f"SELECT * FROM `{table_name}`")
|
988
1042
|
datas = cursor.fetchall()
|
989
1043
|
if not datas:
|
990
1044
|
return
|
991
|
-
duplicate_id =
|
992
|
-
all_datas =
|
1045
|
+
duplicate_id: set = set()
|
1046
|
+
all_datas: set = set()
|
993
1047
|
for data in datas:
|
994
|
-
for
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
all_datas.append(data) # 数据没有重复
|
1004
|
-
del all_datas
|
1005
|
-
|
1006
|
-
if not duplicate_id: # 如果没有重复数据,则跳过该数据表
|
1048
|
+
data_no_id = {k: v for k, v in data.items() if k not in except_key and k != 'id'}
|
1049
|
+
data_tuple = tuple(sorted(data_no_id.items()))
|
1050
|
+
delete_id = data.get('id')
|
1051
|
+
if data_tuple in all_datas:
|
1052
|
+
if delete_id:
|
1053
|
+
duplicate_id.add(delete_id)
|
1054
|
+
else:
|
1055
|
+
all_datas.add(data_tuple)
|
1056
|
+
if not duplicate_id:
|
1007
1057
|
return
|
1008
|
-
|
1009
1058
|
try:
|
1010
1059
|
with self.connection.cursor() as cursor:
|
1011
1060
|
placeholders = ', '.join(['%s'] * len(duplicate_id))
|
1012
|
-
# 移除冗余数据
|
1013
1061
|
sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
|
1014
|
-
cursor.execute(sql, duplicate_id)
|
1015
|
-
logger.info(f"{table_name} -> before: {len(datas)}, "
|
1016
|
-
|
1017
|
-
self.connection.commit() # 提交事务
|
1062
|
+
cursor.execute(sql, list(duplicate_id))
|
1063
|
+
logger.info(f"{table_name} -> before: {len(datas)}, remove: {cursor.rowcount}")
|
1064
|
+
self.connection.commit()
|
1018
1065
|
except Exception as e:
|
1019
1066
|
logger.error(f'{self.db_name}/{table_name}, {e}')
|
1020
|
-
self.connection.rollback()
|
1067
|
+
self.connection.rollback()
|
1021
1068
|
|
1022
|
-
def database_list(self):
|
1023
|
-
"""
|
1024
|
-
|
1069
|
+
def database_list(self) -> list[dict] | None:
|
1070
|
+
"""
|
1071
|
+
获取所有数据库名。
|
1072
|
+
:return: 数据库名列表
|
1073
|
+
"""
|
1074
|
+
connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
|
1025
1075
|
if not connection:
|
1026
|
-
return
|
1076
|
+
return None
|
1027
1077
|
with connection.cursor() as cursor:
|
1028
1078
|
cursor.execute("SHOW DATABASES")
|
1029
|
-
databases = cursor.fetchall()
|
1079
|
+
databases = cursor.fetchall()
|
1030
1080
|
connection.close()
|
1031
1081
|
return databases
|
1032
1082
|
|
1033
|
-
def table_list(self, db_name):
|
1034
|
-
"""
|
1035
|
-
|
1083
|
+
def table_list(self, db_name: str) -> list[dict] | None:
|
1084
|
+
"""
|
1085
|
+
获取指定数据库的所有数据表。
|
1086
|
+
:param db_name: 数据库名
|
1087
|
+
:return: 数据表名列表
|
1088
|
+
"""
|
1089
|
+
connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
|
1036
1090
|
if not connection:
|
1037
|
-
return
|
1091
|
+
return None
|
1038
1092
|
try:
|
1039
1093
|
with connection.cursor() as cursor:
|
1040
|
-
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'")
|
1094
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'")
|
1041
1095
|
database_exists = cursor.fetchone()
|
1042
1096
|
if not database_exists:
|
1043
1097
|
logger.info(f'{db_name}: 数据表不存在!')
|
1044
|
-
return
|
1098
|
+
return None
|
1045
1099
|
except Exception as e:
|
1046
1100
|
logger.error(f'002 {e}')
|
1047
|
-
return
|
1101
|
+
return None
|
1048
1102
|
finally:
|
1049
|
-
connection.close()
|
1050
|
-
|
1051
|
-
self.
|
1052
|
-
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
1103
|
+
connection.close()
|
1104
|
+
self.config.update({'database': db_name})
|
1105
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
|
1053
1106
|
if not connection:
|
1054
|
-
return
|
1107
|
+
return None
|
1055
1108
|
with connection.cursor() as cursor:
|
1056
1109
|
cursor.execute("SHOW TABLES")
|
1057
|
-
tables = cursor.fetchall()
|
1110
|
+
tables = cursor.fetchall()
|
1058
1111
|
connection.close()
|
1059
1112
|
return tables
|
1060
1113
|
|
1061
|
-
def table_datas(self, db_name, table_name, date):
|
1114
|
+
def table_datas(self, db_name: str, table_name: str, date: pd.Timestamp) -> list[dict]:
|
1062
1115
|
"""
|
1063
|
-
|
1116
|
+
获取指定表指定日期的数据。
|
1117
|
+
:param db_name: 数据库名
|
1118
|
+
:param table_name: 表名
|
1119
|
+
:param date: 日期
|
1120
|
+
:return: 数据列表
|
1064
1121
|
"""
|
1065
|
-
self.config.update({'database': db_name})
|
1066
|
-
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=
|
1122
|
+
self.config.update({'database': db_name})
|
1123
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
|
1067
1124
|
if not connection:
|
1068
|
-
return
|
1125
|
+
return []
|
1069
1126
|
try:
|
1070
1127
|
with connection.cursor() as cursor:
|
1071
|
-
sql = f"SELECT * FROM `{table_name}` WHERE
|
1072
|
-
cursor.execute(sql)
|
1128
|
+
sql = f"SELECT * FROM `{table_name}` WHERE 日期 = %s"
|
1129
|
+
cursor.execute(sql, (date,))
|
1073
1130
|
results = cursor.fetchall()
|
1074
1131
|
except Exception as e:
|
1075
1132
|
logger.error(f'001 {e}')
|
1133
|
+
results = []
|
1076
1134
|
finally:
|
1077
1135
|
connection.close()
|
1078
1136
|
return results
|
1079
1137
|
|
1080
|
-
def day_list(self, start_date, end_date):
|
1138
|
+
def day_list(self, start_date: pd.Timestamp, end_date: pd.Timestamp) -> list[pd.Timestamp]:
|
1139
|
+
"""
|
1140
|
+
生成日期范围内的所有日期列表。
|
1141
|
+
:param start_date: 起始日期
|
1142
|
+
:param end_date: 结束日期
|
1143
|
+
:return: 日期列表
|
1144
|
+
"""
|
1081
1145
|
start_date = pd.to_datetime(start_date)
|
1082
1146
|
end_date = pd.to_datetime(end_date)
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1086
|
-
start_date += datetime.timedelta(days=1)
|
1087
|
-
return date_list
|
1088
|
-
|
1089
|
-
def rename_column(self):
|
1090
|
-
""" 批量修改数据库的列名 """
|
1147
|
+
return [start_date + datetime.timedelta(days=i) for i in range((end_date - start_date).days + 1)]
|
1148
|
+
|
1149
|
+
def rename_column(self) -> None:
|
1091
1150
|
"""
|
1092
|
-
|
1093
|
-
# s = OptimizeDatas(username=username, password=password, host=host, port=port)
|
1094
|
-
# s.db_name = db_name
|
1095
|
-
# s.rename_column()
|
1151
|
+
批量修改数据库的列名,去除结尾的下划线。
|
1096
1152
|
"""
|
1097
1153
|
tables = self.table_list(db_name=self.db_name)
|
1098
|
-
for table_dict in tables:
|
1099
|
-
for
|
1100
|
-
self.config.update({'database': self.db_name})
|
1101
|
-
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=
|
1154
|
+
for table_dict in tables or []:
|
1155
|
+
for _, table_name in table_dict.items():
|
1156
|
+
self.config.update({'database': self.db_name})
|
1157
|
+
self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
|
1102
1158
|
if not self.connection:
|
1103
|
-
|
1159
|
+
continue
|
1104
1160
|
with self.connection.cursor() as cursor:
|
1105
|
-
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")
|
1161
|
+
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")
|
1106
1162
|
columns = cursor.fetchall()
|
1107
1163
|
columns = [{column['Field']: column['Type']} for column in columns]
|
1108
1164
|
for column in columns:
|