mdbq 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/mysql/mysql_bak.py +1808 -0
- mdbq/mysql/s_query.py +12 -3
- mdbq/redis/getredis.py +409 -2
- {mdbq-3.6.7.dist-info → mdbq-3.6.9.dist-info}/METADATA +1 -1
- {mdbq-3.6.7.dist-info → mdbq-3.6.9.dist-info}/RECORD +7 -8
- mdbq/redis/getredis_bak20250131.py +0 -265
- mdbq/redis/getredis_deepseek.py +0 -235
- {mdbq-3.6.7.dist-info → mdbq-3.6.9.dist-info}/WHEEL +0 -0
- {mdbq-3.6.7.dist-info → mdbq-3.6.9.dist-info}/top_level.txt +0 -0
mdbq/mysql/mysql_bak.py
ADDED
@@ -0,0 +1,1808 @@
|
|
1
|
+
# -*- coding:utf-8 -*-
|
2
|
+
import datetime
|
3
|
+
import platform
|
4
|
+
import getpass
|
5
|
+
import re
|
6
|
+
import time
|
7
|
+
from fileinput import filename
|
8
|
+
from functools import wraps
|
9
|
+
import warnings
|
10
|
+
import pymysql
|
11
|
+
import numpy as np
|
12
|
+
import pandas as pd
|
13
|
+
from sqlalchemy import create_engine
|
14
|
+
import os
|
15
|
+
import calendar
|
16
|
+
from mdbq.config import set_support
|
17
|
+
from mdbq.config import myconfig
|
18
|
+
import traceback
|
19
|
+
|
20
|
+
warnings.filterwarnings('ignore')
|
21
|
+
"""
|
22
|
+
建表流程:
|
23
|
+
|
24
|
+
|
25
|
+
建表规范:
|
26
|
+
1. 数据库和数据表名如果有字母,必须使用小写,大写在建库后会自动变小写,再次上传数据会找不到数据库(macos和linux都有这种情况)
|
27
|
+
2. 无论是数据库/表/列名还是值,尽量避免特殊字符或者表情符号,数据库/表/列名尽量都使用 `列名` 转义,避免错误
|
28
|
+
3. 小数必须使用 decimal, 禁止 float 和 double, 因为计算精度差异,后续需要聚合数据时会引发很多问题
|
29
|
+
|
30
|
+
"""
|
31
|
+
error_file = os.path.join(set_support.SetSupport(dirname='support').dirname, 'error.log')
|
32
|
+
|
33
|
+
|
34
|
+
def is_valid_date(date_string):
    """
    Classify a value as a pure date, a date-time, or neither.

    Leading zeros in the month/day are optional. The value is converted with
    ``str()`` before matching, so non-string input (int, date, None, ...) is
    classified instead of raising ``TypeError``.

    Args:
        date_string: Any value; its ``str()`` form is inspected.

    Returns:
        1 for a pure date such as ``2024-11-09``,
        2 for a value that ends in ``H:M:S`` such as ``2024-11-09 00:36:45``,
        None when neither pattern matches.
    """
    text = str(date_string)
    # Pure date: YYYY-M-D with optional leading zeros.
    if re.match(r"^(\d{4})-(0?[1-9]|1[0-2])-(0?[1-9]|[12]\d|3[01])$", text) is not None:
        return 1
    # Date + time: only the trailing time component is checked.
    if re.match(r".*\d+:\d+:\d+$", text) is not None:
        return 2
    return None
|
49
|
+
|
50
|
+
|
51
|
+
def is_integer(int_str):
    """
    Report whether a value looks like an integer that fits an INT column.

    Accepts plain digits (up to 9), thousands separators (``1,234,567``) and
    scientific notation with a decimal mantissa (``1.5e3``). The sign of the
    exponent is honoured: ``1.5e-3`` is not an integer, ``1200.0e-2`` is.

    Args:
        int_str: Any value; its ``str()`` form is inspected.

    Returns:
        bool: True when the text denotes an integer, False otherwise.
    """
    text = str(int_str)

    # Scientific notation: mantissa "<int>.<frac>" followed by an exponent.
    sci = re.match(r'^[-+]?(\d+)\.(\d+)[eE]([-+]?)(\d+)$', text)
    if sci:
        int_part, frac_part, exp_sign, exp_digits = sci.groups()
        exponent = int(exp_digits)

        if exp_sign == '-' and exponent:
            # Shifting the point left: every digit that ends up after the
            # decimal point must be zero for the value to be an integer.
            shift = len(frac_part) + exponent
            digits = (int_part + frac_part).zfill(shift + 1)
            if digits[-shift:].strip('0'):
                return False
            return len(digits[:-shift].lstrip('0')) <= 10

        if int(int_part) == 0:
            if exponent > 10:  # expanded integer would exceed 10 digits
                return False
        elif len(int_part) + exponent > 10:  # expanded integer would exceed 10 digits
            return False
        # The exponent must consume every fractional digit.
        return exponent >= len(frac_part)

    # Plain number, optionally with thousands separators.
    plain_pattern = r'^[-+]?\d{1,3}(,\d{3}){0,3}$|^[-+]?\d{1,9}$'
    return re.match(plain_pattern, text) is not None
|
70
|
+
|
71
|
+
|
72
|
+
def count_decimal_places(num_str):
    """
    Count integer and fractional digits of a decimal number.

    Supports plain decimals and scientific notation. The sign of the exponent
    is honoured, so ``1.5e-3`` (0.0015) yields four fractional digits.

    Args:
        num_str: Any value; its ``str()`` form is inspected.

    Returns:
        tuple[int, int]: ``(integer_digits, fractional_digits)``. Leading
        zeros of the integer part are not counted. ``(0, 0)`` is returned for
        non-numeric text and for values with no fractional digits.
    """
    text = str(num_str)
    if not re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', text):
        return 0, 0

    # Scientific notation with a decimal mantissa.
    sci = re.search(r'(\d+)\.(\d+)[eE]([-+]?)(\d+)$', text)
    if sci:
        int_part, frac_part, exp_sign, exp_digits = sci.groups()
        int_len = len(int_part.lstrip('0'))
        exponent = int(exp_digits)
        if exp_sign == '-':
            # Point moves left: integer digits shrink, fractional digits grow.
            return max(int_len - exponent, 0), len(frac_part) + exponent
        if exponent < len(frac_part):
            # Point moves right but some fractional digits remain.
            return int_len + exponent, len(frac_part) - exponent
        # The exponent consumes the whole fraction: the value is an integer.
        return 0, 0

    # Plain decimal.
    plain = re.search(r'(\d+)\.(\d+)$', text)
    if plain:
        return len(plain.group(1).lstrip('0')), len(plain.group(2))
    return 0, 0
|
93
|
+
|
94
|
+
|
95
|
+
class MysqlUpload:
|
96
|
+
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
97
|
+
self.username = username
|
98
|
+
self.password = password
|
99
|
+
self.host = host
|
100
|
+
self.port = port
|
101
|
+
if username == '' or password == '' or host == '' or port == 0:
|
102
|
+
self.config = None
|
103
|
+
else:
|
104
|
+
self.config = {
|
105
|
+
'host': self.host,
|
106
|
+
'port': int(self.port),
|
107
|
+
'user': self.username,
|
108
|
+
'password': self.password,
|
109
|
+
'charset': charset, # utf8mb4 支持存储四字节的UTF-8字符集
|
110
|
+
'cursorclass': pymysql.cursors.DictCursor,
|
111
|
+
}
|
112
|
+
self.filename = None
|
113
|
+
|
114
|
+
@staticmethod
|
115
|
+
def try_except(func): # 在类内部定义一个异常处理方法
|
116
|
+
|
117
|
+
@wraps(func)
|
118
|
+
def wrapper(*args, **kwargs):
|
119
|
+
try:
|
120
|
+
return func(*args, **kwargs)
|
121
|
+
except Exception as e:
|
122
|
+
print(f'{func.__name__}, {e}') # 将异常信息返回
|
123
|
+
with open(error_file, 'a') as f:
|
124
|
+
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
125
|
+
f.write(f'\n{now} \n')
|
126
|
+
f.write(f'函数注释内容(用于定位函数): {func.__doc__} \n')
|
127
|
+
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
128
|
+
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
129
|
+
print(f'更多信息请查看日志文件: {error_file}')
|
130
|
+
|
131
|
+
return wrapper
|
132
|
+
|
133
|
+
def keep_connect(self, _db_name, _config, max_try: int=10):
    """
    Open a pymysql connection, retrying on failure.

    Args:
        _db_name: Name used only in the final failure message.
        _config: Keyword arguments passed to ``pymysql.connect``.
        max_try: Maximum number of connection attempts.

    Returns:
        The open connection, or None when every attempt failed.
    """
    for attempt in range(1, max_try + 1):
        try:
            return pymysql.connect(**_config)
        except Exception as e:
            print(f'连接失败,正在重试: {attempt}/{max_try} {e}')
            # Wait before the next attempt only; sleeping after the last
            # failure would just delay the caller by 30 seconds.
            if attempt < max_try:
                time.sleep(30)
    print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
    return None
|
145
|
+
|
146
|
+
def cover_doc_dtypes(self, dict_data):
    """
    Clean the keys of a document dict and infer a MySQL type per column.

    Keys are lower-cased, every special character is replaced by ``_``,
    repeated underscores are collapsed and trailing ones removed. Values are
    returned unchanged; only their ``str()`` form is inspected for type
    inference.

    Args:
        dict_data: Mapping of column name to value.

    Returns:
        ``(dtypes, cleaned)`` where ``dtypes`` maps each cleaned key to a
        MySQL type (plus ``'数据主体': 'longblob'``) and ``cleaned`` maps each
        cleaned key to its original value. Returns None when ``dict_data``
        is empty.
    """
    if not dict_data:
        print(f'mysql.py -> MysqlUpload -> cover_doc_dtypes -> 传入的字典不能为空')
        return
    res_dict = {}
    new_dict_data = {}
    for k, v in dict_data.items():
        k = str(k).lower()
        # No flags argument: the 4th positional parameter of re.sub is `count`,
        # so passing re.IGNORECASE there limited the replacement to 2 characters.
        k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k)
        k = k.replace(')', '')
        k = re.sub(r'_{2,}', '_', k)
        k = re.sub(r'_+$', '', k)
        is_code_like = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
        is_ratio_like = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)

        text = str(v)  # inspect the textual form so non-string values cannot raise
        date_type = is_valid_date(text)
        int_num = is_integer(text)
        count_int, count_float = count_decimal_places(text)

        if is_code_like:  # codes / ids / SKU-like columns are stored as text
            col_type = 'varchar(100)'
        elif k == '日期':
            col_type = 'DATE'
        elif k == '更新时间':
            col_type = 'TIMESTAMP'
        elif is_ratio_like:  # ratios
            col_type = 'decimal(10,4)'
        elif date_type == 1:  # pure date
            col_type = 'DATE'
        elif date_type == 2:  # date + time
            col_type = 'DATETIME'
        elif int_num:
            col_type = 'INT'
        elif count_float > 0:
            if count_int + count_float > 10:
                col_type = 'decimal(14,6)' if count_float >= 6 else 'decimal(14,4)'
            elif count_float >= 6:
                col_type = 'decimal(14,6)'
            elif count_float >= 4:
                col_type = 'decimal(12,4)'
            else:
                col_type = 'decimal(10,2)'
        else:
            col_type = 'varchar(255)'
        res_dict[k] = col_type
        new_dict_data[k] = v
    res_dict['数据主体'] = 'longblob'
    return res_dict, new_dict_data
|
200
|
+
|
201
|
+
@try_except
|
202
|
+
def doc_to_sql(self, db_name, table_name, dict_data, set_typ={}, remove_by_key=None, allow_not_null=False, filename=None, reset_id=False):
|
203
|
+
"""
|
204
|
+
db_name:
|
205
|
+
table_name:
|
206
|
+
remove_by_key: 设置时先删除数据再插入,不设置则直接添加
|
207
|
+
dict_data:
|
208
|
+
set_typ:
|
209
|
+
allow_not_null:
|
210
|
+
filename:
|
211
|
+
reset_id:
|
212
|
+
"""
|
213
|
+
if not self.config:
|
214
|
+
return
|
215
|
+
if '数据主体' not in dict_data.keys():
|
216
|
+
print(f'dict_data 中"数据主体"键不能为空')
|
217
|
+
return
|
218
|
+
|
219
|
+
# connection = pymysql.connect(**self.config) # 连接数据库
|
220
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
221
|
+
if not connection:
|
222
|
+
return
|
223
|
+
with connection.cursor() as cursor:
|
224
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
225
|
+
database_exists = cursor.fetchone()
|
226
|
+
if not database_exists:
|
227
|
+
# 如果数据库不存在,则新建
|
228
|
+
if '8.138.27' in str(self.host) or platform.system() == "Linux": # 阿里云 mysql 低版本不支持 0900
|
229
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
|
230
|
+
self.config.update({'charset': 'utf8mb4_unicode_ci'})
|
231
|
+
if '192.168.1.100' in str(self.host):
|
232
|
+
sql = f"CREATE DATABASE `{db_name}`"
|
233
|
+
else:
|
234
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
235
|
+
cursor.execute(sql)
|
236
|
+
connection.commit()
|
237
|
+
print(f"创建Database: {db_name}")
|
238
|
+
|
239
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
240
|
+
# connection = pymysql.connect(**self.config) # 重新连接数据库
|
241
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
242
|
+
if not connection:
|
243
|
+
return
|
244
|
+
with connection.cursor() as cursor:
|
245
|
+
# 1. 查询表, 不存在则创建一个空表
|
246
|
+
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
247
|
+
cursor.execute(sql, (table_name))
|
248
|
+
if not cursor.fetchone():
|
249
|
+
sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
|
250
|
+
cursor.execute(sql)
|
251
|
+
print(f'创建 mysql 表: {table_name}')
|
252
|
+
|
253
|
+
new_dict = {}
|
254
|
+
[new_dict.update({k: v}) for k, v in dict_data.items() if k != '数据主体']
|
255
|
+
# 清理列名中的非法字符
|
256
|
+
dtypes, new_dict = self.cover_doc_dtypes(new_dict)
|
257
|
+
if set_typ:
|
258
|
+
# 更新自定义的列数据类型
|
259
|
+
for k, v in dtypes.items():
|
260
|
+
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
261
|
+
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
262
|
+
|
263
|
+
# 检查列
|
264
|
+
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
265
|
+
cursor.execute(sql, (db_name, table_name))
|
266
|
+
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
|
267
|
+
|
268
|
+
col_not_exist = [col for col in set_typ.keys() if col not in col_exist] # 不存在的列
|
269
|
+
# 不存在则新建列
|
270
|
+
if col_not_exist: # 数据表中不存在的列
|
271
|
+
for col in col_not_exist:
|
272
|
+
# 创建列,需转义
|
273
|
+
if allow_not_null:
|
274
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {set_typ[col]};"
|
275
|
+
else:
|
276
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {set_typ[col]} NOT NULL;"
|
277
|
+
cursor.execute(sql)
|
278
|
+
print(f"添加列: {col}({set_typ[col]})") # 添加列并指定数据类型
|
279
|
+
|
280
|
+
if col == '日期':
|
281
|
+
sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
282
|
+
print(f"设置为索引: {col}({set_typ[col]})")
|
283
|
+
cursor.execute(sql)
|
284
|
+
connection.commit() # 提交事务
|
285
|
+
|
286
|
+
if remove_by_key:
|
287
|
+
# 删除数据
|
288
|
+
se_key = ', '.join(remove_by_key)
|
289
|
+
condition = []
|
290
|
+
for up_col in remove_by_key:
|
291
|
+
condition += [f'`{up_col}` = "{dict_data[up_col]}"']
|
292
|
+
condition = ' AND '.join(condition)
|
293
|
+
# print(condition)
|
294
|
+
sql = f"SELECT {se_key} FROM `{table_name}` WHERE {condition}"
|
295
|
+
cursor.execute(sql)
|
296
|
+
result = cursor.fetchall()
|
297
|
+
if result:
|
298
|
+
sql = f'DELETE FROM `{table_name}` WHERE {condition};'
|
299
|
+
cursor.execute(sql)
|
300
|
+
|
301
|
+
# 插入数据到数据库
|
302
|
+
# 有数据格式错误问题,所以分开处理,将数据主体移到最后面用占位符
|
303
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
304
|
+
print(f'{now} 正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name} -> {filename}')
|
305
|
+
if new_dict:
|
306
|
+
cols = ', '.join(f"`{item}`" for item in new_dict.keys()) # 列名需要转义
|
307
|
+
values = ', '.join([f'"{item}"' for item in new_dict.values()]) # 值要加引号
|
308
|
+
cols = ', '.join([cols, '数据主体'])
|
309
|
+
binary_data = dict_data['数据主体']
|
310
|
+
sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({values}, %s)"
|
311
|
+
# print(sql)
|
312
|
+
cursor.execute(sql, binary_data)
|
313
|
+
else:
|
314
|
+
sql = f"""INSERT INTO `{table_name}` (数据主体) VALUES (%s);"""
|
315
|
+
cursor.execute(sql, dict_data['数据主体'])
|
316
|
+
|
317
|
+
if reset_id:
|
318
|
+
# 6. 重置自增列
|
319
|
+
try:
|
320
|
+
# 查询所有复合主键
|
321
|
+
sql = (
|
322
|
+
f"SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
|
323
|
+
f"WHERE `TABLE_SCHEMA` = '{db_name}'AND `TABLE_NAME` = '{table_name}' AND `COLUMN_KEY` = 'PRI';"
|
324
|
+
)
|
325
|
+
cursor.execute(sql)
|
326
|
+
result = cursor.fetchall() # 复合主键数
|
327
|
+
if len(result) <= 1: # 如果存在复合主键,则不能直接删除 id 键,其他主键可能不是唯一,会报错
|
328
|
+
column_name = 'id'
|
329
|
+
sql = (f'SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS '
|
330
|
+
f'WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s')
|
331
|
+
# cursor.execute(f"SHOW COLUMNS FROM `{table_name}` LIKE 'id'")
|
332
|
+
cursor.execute(sql, (db_name, table_name, column_name))
|
333
|
+
result = cursor.fetchone()
|
334
|
+
if result:
|
335
|
+
# cursor.execute(f"ALTER TABLE `{table_name}` DROP COLUMN id;") # 删除 id 列
|
336
|
+
sql = f"ALTER TABLE `{table_name}` DROP COLUMN {column_name}" # 删除 id 列
|
337
|
+
cursor.execute(sql)
|
338
|
+
cursor.execute(
|
339
|
+
f"ALTER TABLE `{table_name}` ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
|
340
|
+
cursor.execute(f"ALTER TABLE `{table_name}` AUTO_INCREMENT = 1") # 设置自增从 1 开始
|
341
|
+
# print(f'重置自增id')
|
342
|
+
else:
|
343
|
+
print(f'{table_name} 存在复合主键: 存在复合主键: {[item['PrimaryKey'] for item in result]}, 无法重置自增id')
|
344
|
+
except Exception as e:
|
345
|
+
print(f'333 {table_name} {e}')
|
346
|
+
connection.rollback()
|
347
|
+
connection.commit()
|
348
|
+
|
349
|
+
@try_except
|
350
|
+
def insert_many_dict(self, db_name, table_name, dict_data_list, icm_update=None, main_key=None, unique_main_key=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
351
|
+
"""
|
352
|
+
插入字典数据
|
353
|
+
dict_data: 字典
|
354
|
+
main_key: 指定索引列, 通常用日期列,默认会设置日期为索引
|
355
|
+
unique_main_key: 指定唯一索引列
|
356
|
+
index_length: 索引长度
|
357
|
+
icm_update: 增量更正,指定后 main_key 只用于检查/创建列,不能更新数据
|
358
|
+
set_typ: {}
|
359
|
+
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
360
|
+
"""
|
361
|
+
if not self.config:
|
362
|
+
return
|
363
|
+
if icm_update:
|
364
|
+
if main_key or unique_main_key:
|
365
|
+
print(f'icm_update/unique_main_key/unique_main_key 参数不能同时设定')
|
366
|
+
return
|
367
|
+
if not main_key:
|
368
|
+
main_key = []
|
369
|
+
if not unique_main_key:
|
370
|
+
unique_main_key = []
|
371
|
+
|
372
|
+
if not dict_data_list:
|
373
|
+
print(f'dict_data_list 不能为空 ')
|
374
|
+
return
|
375
|
+
dict_data = dict_data_list[0]
|
376
|
+
if cut_data:
|
377
|
+
if '日期' in dict_data.keys():
|
378
|
+
try:
|
379
|
+
__y = pd.to_datetime(dict_data['日期']).strftime('%Y')
|
380
|
+
__y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
|
381
|
+
if str(cut_data).lower() == 'year':
|
382
|
+
table_name = f'{table_name}_{__y}'
|
383
|
+
elif str(cut_data).lower() == 'month':
|
384
|
+
table_name = f'{table_name}_{__y_m}'
|
385
|
+
else:
|
386
|
+
print(f'参数不正确,cut_data应为 year 或 month ')
|
387
|
+
except Exception as e:
|
388
|
+
print(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
389
|
+
|
390
|
+
# connection = pymysql.connect(**self.config) # 连接数据库
|
391
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
392
|
+
if not connection:
|
393
|
+
return
|
394
|
+
with connection.cursor() as cursor:
|
395
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
396
|
+
database_exists = cursor.fetchone()
|
397
|
+
if not database_exists:
|
398
|
+
# 如果数据库不存在,则新建
|
399
|
+
if '8.138.27' in str(self.host) or platform.system() == "Linux": # 阿里云 mysql 低版本不支持 0900
|
400
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
|
401
|
+
self.config.update({'charset': 'utf8mb4_unicode_ci'})
|
402
|
+
if '192.168.1.100' in str(self.host):
|
403
|
+
sql = f"CREATE DATABASE `{db_name}`"
|
404
|
+
else:
|
405
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
406
|
+
cursor.execute(sql)
|
407
|
+
connection.commit()
|
408
|
+
print(f"创建Database: {db_name}")
|
409
|
+
|
410
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
411
|
+
# connection = pymysql.connect(**self.config) # 重新连接数据库
|
412
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
413
|
+
if not connection:
|
414
|
+
return
|
415
|
+
with connection.cursor() as cursor:
|
416
|
+
# 1. 查询表, 不存在则创建一个空表
|
417
|
+
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
418
|
+
cursor.execute(sql, (table_name))
|
419
|
+
if not cursor.fetchone():
|
420
|
+
sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
|
421
|
+
cursor.execute(sql)
|
422
|
+
print(f'创建 mysql 表: {table_name}')
|
423
|
+
|
424
|
+
# 根据 dict_data 的值添加指定的数据类型
|
425
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
426
|
+
if set_typ:
|
427
|
+
# 更新自定义的列数据类型
|
428
|
+
for k, v in dtypes.items():
|
429
|
+
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
430
|
+
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
431
|
+
|
432
|
+
# 检查列
|
433
|
+
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
434
|
+
cursor.execute(sql, (db_name, table_name))
|
435
|
+
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
|
436
|
+
col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
|
437
|
+
# 不存在则新建列
|
438
|
+
if col_not_exist: # 数据表中不存在的列
|
439
|
+
for col in col_not_exist:
|
440
|
+
# 创建列,需转义
|
441
|
+
if allow_not_null:
|
442
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
|
443
|
+
else:
|
444
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
445
|
+
# sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
446
|
+
# print(sql)
|
447
|
+
cursor.execute(sql)
|
448
|
+
print(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
449
|
+
|
450
|
+
if col in main_key or col == '日期':
|
451
|
+
sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
452
|
+
print(f"设置为索引: {col}({dtypes[col]})")
|
453
|
+
cursor.execute(sql)
|
454
|
+
if col in unique_main_key:
|
455
|
+
if dtypes[col] == 'mediumtext':
|
456
|
+
sql = f"ALTER TABLE `{table_name}` ADD UNIQUE (`{col}`({index_length}))"
|
457
|
+
else:
|
458
|
+
sql = f"ALTER TABLE `{table_name}` ADD UNIQUE (`{col}`)"
|
459
|
+
cursor.execute(sql)
|
460
|
+
# if col in main_key or col in unique_main_key:
|
461
|
+
# sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
|
462
|
+
# cursor.execute(sql, (col))
|
463
|
+
# result = cursor.fetchone() # 检查索引是否存在
|
464
|
+
# if not result:
|
465
|
+
# if col in main_key:
|
466
|
+
# sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
467
|
+
# print(f"设置为索引: {col}({dtypes[col]})")
|
468
|
+
# cursor.execute(sql)
|
469
|
+
# elif col in unique_main_key:
|
470
|
+
# if dtypes[col] == 'mediumtext':
|
471
|
+
# sql = f"CREATE INDEX UNIQUE index_name ON `{table_name}` (`{col}`({index_length}));"
|
472
|
+
# else:
|
473
|
+
# sql = f"CREATE INDEX UNIQUE index_name ON `{table_name}` (`{col}`);"
|
474
|
+
# print(f"设置唯一索引: {col}({dtypes[col]})")
|
475
|
+
# print(sql)
|
476
|
+
# cursor.execute(sql)
|
477
|
+
connection.commit() # 提交事务
|
478
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
479
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
480
|
+
# 处理插入的数据
|
481
|
+
for dict_data in dict_data_list:
|
482
|
+
# print(dict_data)
|
483
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
484
|
+
if icm_update:
|
485
|
+
""" 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
|
486
|
+
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
487
|
+
cursor.execute(sql, (db_name, {table_name}))
|
488
|
+
columns = cursor.fetchall()
|
489
|
+
cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
|
490
|
+
update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
|
491
|
+
|
492
|
+
# unique_keys 示例: `日期`, `余额`
|
493
|
+
unique_keys = ', '.join(f"`{item}`" for item in update_col) # 列名需要转义
|
494
|
+
condition = []
|
495
|
+
for up_col in icm_update:
|
496
|
+
condition += [f'`{up_col}` = "{dict_data[up_col]}"']
|
497
|
+
condition = ' AND '.join(condition) # condition值示例: `品销宝余额` = '2930.73' AND `短信剩余` = '67471'
|
498
|
+
sql = f"SELECT {unique_keys} FROM `{table_name}` WHERE {condition}"
|
499
|
+
# print(sql)
|
500
|
+
# sql = f"SELECT {unique_keys} FROM `{table_name}` WHERE `创建时间` = '2014-09-19 14:32:33'"
|
501
|
+
cursor.execute(sql)
|
502
|
+
results = cursor.fetchall() # results 是数据库取出的数据
|
503
|
+
if results: # 有数据返回,再进行增量检查
|
504
|
+
for result in results: # results 是数据库数据, dict_data 是传进来的数据
|
505
|
+
change_col = [] # 发生变化的列名
|
506
|
+
change_values = [] # 发生变化的数据
|
507
|
+
for col in update_col:
|
508
|
+
# 因为 mysql 里面有 decimal 数据类型,要移除末尾的 0 再做比较(df 默认将 5.00 小数截断为 5.0)
|
509
|
+
df_value = str(dict_data[col])
|
510
|
+
mysql_value = str(result[col])
|
511
|
+
if '.' in df_value:
|
512
|
+
df_value = re.sub(r'0+$', '', df_value)
|
513
|
+
df_value = re.sub(r'\.$', '', df_value)
|
514
|
+
if '.' in mysql_value:
|
515
|
+
mysql_value = re.sub(r'0+$', '', mysql_value)
|
516
|
+
mysql_value = re.sub(r'\.$', '', mysql_value)
|
517
|
+
if df_value != mysql_value: # 传进来的数据和数据库比较, 有变化
|
518
|
+
# print(f'{dict_data['日期']}{dict_data['商品id']}{col} 列的值有变化,{str(dict_data[col])} != {str(result[col])}')
|
519
|
+
change_values += [f"`{col}` = \"{str(dict_data[col])}\""]
|
520
|
+
change_col.append(col)
|
521
|
+
not_change_col = [item for item in update_col if item not in change_col]
|
522
|
+
# change_values 是 df 传进来且和数据库对比后,发生了变化的数据,值示例: [`品销宝余额` = '9999.0', `短信剩余` = '888']
|
523
|
+
if change_values: # change_values 有数据返回,表示值需要更新
|
524
|
+
if not_change_col:
|
525
|
+
not_change_values = [f'`{col}` = "{str(dict_data[col])}"' for col in not_change_col]
|
526
|
+
not_change_values = ' AND '.join(
|
527
|
+
not_change_values) # 示例: `短信剩余` = '888' AND `test1` = '93'
|
528
|
+
# print(change_values, not_change_values)
|
529
|
+
condition += f' AND {not_change_values}' # 重新构建完整的查询条件,将未发生变化的列加进查询条件
|
530
|
+
change_values = ', '.join(f"{item}" for item in change_values) # 注意这里 item 外面没有反引号
|
531
|
+
sql = "UPDATE `%s` SET %s WHERE %s" % (table_name, change_values, condition)
|
532
|
+
# print(sql)
|
533
|
+
cursor.execute(sql)
|
534
|
+
else: # 没有数据返回,则直接插入数据
|
535
|
+
cols = ', '.join(f"`{item}`" for item in dict_data.keys()) # 列名需要转义
|
536
|
+
# data.update({item: f"{data[item]}" for item in data.keys()}) # 全部值转字符, 不是必须的
|
537
|
+
values = ', '.join([f'"{item}"' for item in dict_data.values()]) # 值要加引号
|
538
|
+
sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({values});"
|
539
|
+
cursor.execute(sql)
|
540
|
+
connection.commit() # 提交数据库
|
541
|
+
continue
|
542
|
+
|
543
|
+
# 构建 keys
|
544
|
+
keys_data = ', '.join([f'`{str(item)}`' for item in dict_data.keys()])
|
545
|
+
# 构建 values
|
546
|
+
values_data = ', '.join(f'"{str(item)}"' for item in dict_data.values())
|
547
|
+
# 构建其他键值,重复时要更新的其他键
|
548
|
+
if main_key:
|
549
|
+
for col in main_key:
|
550
|
+
del dict_data[col]
|
551
|
+
if unique_main_key:
|
552
|
+
for col in unique_main_key:
|
553
|
+
del dict_data[col]
|
554
|
+
# 涉及列名务必使用反引号
|
555
|
+
update_datas = ', '.join([f'`{k}` = VALUES(`{k}`)' for k, v in dict_data.items()])
|
556
|
+
|
557
|
+
# 构建 sql
|
558
|
+
sql = f"INSERT INTO %s (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s" % (table_name, keys_data, values_data, update_datas)
|
559
|
+
# print(sql)
|
560
|
+
cursor.execute(sql)
|
561
|
+
connection.commit() # 提交数据库
|
562
|
+
connection.close()
|
563
|
+
|
564
|
+
@try_except
|
565
|
+
def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, main_key=None, unique_main_key=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
566
|
+
"""
|
567
|
+
插入字典数据
|
568
|
+
dict_data: 字典
|
569
|
+
main_key: 指定索引列, 通常用日期列,默认会设置日期为索引
|
570
|
+
unique_main_key: 指定唯一索引列
|
571
|
+
index_length: 索引长度
|
572
|
+
icm_update: 增量更正,指定后 main_key 只用于检查/创建列,不能更新数据
|
573
|
+
set_typ: {}
|
574
|
+
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
575
|
+
"""
|
576
|
+
if not self.config:
|
577
|
+
return
|
578
|
+
if icm_update:
|
579
|
+
if main_key or unique_main_key:
|
580
|
+
print(f'icm_update/unique_main_key/unique_main_key 参数不能同时设定')
|
581
|
+
return
|
582
|
+
if not main_key:
|
583
|
+
main_key = []
|
584
|
+
if not unique_main_key:
|
585
|
+
unique_main_key = []
|
586
|
+
|
587
|
+
if cut_data:
|
588
|
+
if '日期' in dict_data.keys():
|
589
|
+
try:
|
590
|
+
__y = pd.to_datetime(dict_data['日期']).strftime('%Y')
|
591
|
+
__y_m = pd.to_datetime(dict_data['日期']).strftime('%Y-%m')
|
592
|
+
if str(cut_data).lower() == 'year':
|
593
|
+
table_name = f'{table_name}_{__y}'
|
594
|
+
elif str(cut_data).lower() == 'month':
|
595
|
+
table_name = f'{table_name}_{__y_m}'
|
596
|
+
else:
|
597
|
+
print(f'参数不正确,cut_data应为 year 或 month ')
|
598
|
+
except Exception as e:
|
599
|
+
print(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
|
600
|
+
|
601
|
+
# connection = pymysql.connect(**self.config) # 连接数据库
|
602
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
603
|
+
if not connection:
|
604
|
+
return
|
605
|
+
with connection.cursor() as cursor:
|
606
|
+
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
607
|
+
database_exists = cursor.fetchone()
|
608
|
+
if not database_exists:
|
609
|
+
# 如果数据库不存在,则新建
|
610
|
+
if '8.138.27' in str(self.host) or platform.system() == "Linux": # 阿里云 mysql 低版本不支持 0900
|
611
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
|
612
|
+
self.config.update({'charset': 'utf8mb4_unicode_ci'})
|
613
|
+
if '192.168.1.100' in str(self.host):
|
614
|
+
sql = f"CREATE DATABASE `{db_name}`"
|
615
|
+
else:
|
616
|
+
sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
|
617
|
+
cursor.execute(sql)
|
618
|
+
connection.commit()
|
619
|
+
print(f"创建Database: {db_name}")
|
620
|
+
|
621
|
+
self.config.update({'database': db_name}) # 添加更新 config 字段
|
622
|
+
# connection = pymysql.connect(**self.config) # 重新连接数据库
|
623
|
+
connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
|
624
|
+
if not connection:
|
625
|
+
return
|
626
|
+
with connection.cursor() as cursor:
|
627
|
+
# 1. 查询表, 不存在则创建一个空表
|
628
|
+
sql = "SHOW TABLES LIKE %s;" # 有特殊字符不需转义
|
629
|
+
cursor.execute(sql, (table_name))
|
630
|
+
if not cursor.fetchone():
|
631
|
+
sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
|
632
|
+
cursor.execute(sql)
|
633
|
+
print(f'创建 mysql 表: {table_name}')
|
634
|
+
|
635
|
+
# 根据 dict_data 的值添加指定的数据类型
|
636
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
637
|
+
if set_typ:
|
638
|
+
# 更新自定义的列数据类型
|
639
|
+
for k, v in dtypes.items():
|
640
|
+
# 确保传进来的 set_typ 键存在于实际的 df 列才 update
|
641
|
+
[dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
|
642
|
+
|
643
|
+
# 检查列
|
644
|
+
sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
|
645
|
+
cursor.execute(sql, (db_name, table_name))
|
646
|
+
col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()] # 已存在的所有列
|
647
|
+
col_not_exist = [col for col in dict_data.keys() if col not in col_exist] # 不存在的列
|
648
|
+
# 不存在则新建列
|
649
|
+
if col_not_exist: # 数据表中不存在的列
|
650
|
+
for col in col_not_exist:
|
651
|
+
# 创建列,需转义
|
652
|
+
if allow_not_null:
|
653
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
|
654
|
+
else:
|
655
|
+
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
656
|
+
# sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
657
|
+
# print(sql)
|
658
|
+
cursor.execute(sql)
|
659
|
+
print(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
660
|
+
|
661
|
+
if col in main_key or col == '日期':
|
662
|
+
sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
663
|
+
print(f"设置为索引: {col}({dtypes[col]})")
|
664
|
+
cursor.execute(sql)
|
665
|
+
if col in unique_main_key:
|
666
|
+
if dtypes[col] == 'mediumtext':
|
667
|
+
sql = f"ALTER TABLE `{table_name}` ADD UNIQUE (`{col}`({index_length}))"
|
668
|
+
else:
|
669
|
+
sql = f"ALTER TABLE `{table_name}` ADD UNIQUE (`{col}`)"
|
670
|
+
cursor.execute(sql)
|
671
|
+
# if col in main_key or col in unique_main_key:
|
672
|
+
# sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
|
673
|
+
# cursor.execute(sql, (col))
|
674
|
+
# result = cursor.fetchone() # 检查索引是否存在
|
675
|
+
# if not result:
|
676
|
+
# if col in main_key:
|
677
|
+
# sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
|
678
|
+
# print(f"设置为索引: {col}({dtypes[col]})")
|
679
|
+
# cursor.execute(sql)
|
680
|
+
# elif col in unique_main_key:
|
681
|
+
# if dtypes[col] == 'mediumtext':
|
682
|
+
# sql = f"CREATE INDEX UNIQUE index_name ON `{table_name}` (`{col}`({index_length}));"
|
683
|
+
# else:
|
684
|
+
# sql = f"CREATE INDEX UNIQUE index_name ON `{table_name}` (`{col}`);"
|
685
|
+
# print(f"设置唯一索引: {col}({dtypes[col]})")
|
686
|
+
# print(sql)
|
687
|
+
# cursor.execute(sql)
|
688
|
+
connection.commit() # 提交事务
|
689
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
690
|
+
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
691
|
+
# 处理插入的数据
|
692
|
+
if icm_update:
|
693
|
+
""" 使用增量更新: 需确保 icm_update['主键'] 传进来的列组合是数据表中唯一,值不会发生变化且不会重复,否则可能产生覆盖 """
|
694
|
+
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
695
|
+
cursor.execute(sql, (db_name, {table_name}))
|
696
|
+
columns = cursor.fetchall()
|
697
|
+
cols_exist = [col['COLUMN_NAME'] for col in columns] # 数据表的所有列, 返回 list
|
698
|
+
update_col = [item for item in cols_exist if item not in icm_update and item != 'id'] # 除了主键外的其他列
|
699
|
+
|
700
|
+
# unique_keys 示例: `日期`, `余额`
|
701
|
+
unique_keys = ', '.join(f"`{item}`" for item in update_col) # 列名需要转义
|
702
|
+
condition = []
|
703
|
+
for up_col in icm_update:
|
704
|
+
condition += [f'`{up_col}` = "{dict_data[up_col]}"']
|
705
|
+
condition = ' AND '.join(condition) # condition值示例: `品销宝余额` = '2930.73' AND `短信剩余` = '67471'
|
706
|
+
sql = f"SELECT {unique_keys} FROM `{table_name}` WHERE {condition}"
|
707
|
+
# print(sql)
|
708
|
+
# sql = f"SELECT {unique_keys} FROM `{table_name}` WHERE `创建时间` = '2014-09-19 14:32:33'"
|
709
|
+
cursor.execute(sql)
|
710
|
+
results = cursor.fetchall() # results 是数据库取出的数据
|
711
|
+
if results: # 有数据返回,再进行增量检查
|
712
|
+
for result in results: # results 是数据库数据, dict_data 是传进来的数据
|
713
|
+
change_col = [] # 发生变化的列名
|
714
|
+
change_values = [] # 发生变化的数据
|
715
|
+
for col in update_col:
|
716
|
+
# 因为 mysql 里面有 decimal 数据类型,要移除末尾的 0 再做比较(df 默认将 5.00 小数截断为 5.0)
|
717
|
+
df_value = str(dict_data[col])
|
718
|
+
mysql_value = str(result[col])
|
719
|
+
if '.' in df_value:
|
720
|
+
df_value = re.sub(r'0+$', '', df_value)
|
721
|
+
df_value = re.sub(r'\.$', '', df_value)
|
722
|
+
if '.' in mysql_value:
|
723
|
+
mysql_value = re.sub(r'0+$', '', mysql_value)
|
724
|
+
mysql_value = re.sub(r'\.$', '', mysql_value)
|
725
|
+
if df_value != mysql_value: # 传进来的数据和数据库比较, 有变化
|
726
|
+
# print(f'{dict_data['日期']}{dict_data['商品id']}{col} 列的值有变化,{str(dict_data[col])} != {str(result[col])}')
|
727
|
+
change_values += [f"`{col}` = \"{str(dict_data[col])}\""]
|
728
|
+
change_col.append(col)
|
729
|
+
not_change_col = [item for item in update_col if item not in change_col]
|
730
|
+
# change_values 是 df 传进来且和数据库对比后,发生了变化的数据,值示例: [`品销宝余额` = '9999.0', `短信剩余` = '888']
|
731
|
+
if change_values: # change_values 有数据返回,表示值需要更新
|
732
|
+
if not_change_col:
|
733
|
+
not_change_values = [f'`{col}` = "{str(dict_data[col])}"' for col in not_change_col]
|
734
|
+
not_change_values = ' AND '.join(
|
735
|
+
not_change_values) # 示例: `短信剩余` = '888' AND `test1` = '93'
|
736
|
+
# print(change_values, not_change_values)
|
737
|
+
condition += f' AND {not_change_values}' # 重新构建完整的查询条件,将未发生变化的列加进查询条件
|
738
|
+
change_values = ', '.join(f"{item}" for item in change_values) # 注意这里 item 外面没有反引号
|
739
|
+
sql = "UPDATE `%s` SET %s WHERE %s" % (table_name, change_values, condition)
|
740
|
+
# print(sql)
|
741
|
+
cursor.execute(sql)
|
742
|
+
else: # 没有数据返回,则直接插入数据
|
743
|
+
cols = ', '.join(f"`{item}`" for item in dict_data.keys()) # 列名需要转义
|
744
|
+
# data.update({item: f"{data[item]}" for item in data.keys()}) # 全部值转字符, 不是必须的
|
745
|
+
values = ', '.join([f'"{item}"' for item in dict_data.values()]) # 值要加引号
|
746
|
+
sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({values});"
|
747
|
+
cursor.execute(sql)
|
748
|
+
connection.commit() # 提交数据库
|
749
|
+
connection.close()
|
750
|
+
return
|
751
|
+
|
752
|
+
# 构建 keys
|
753
|
+
keys_data = ', '.join([f'`{str(item)}`' for item in dict_data.keys()])
|
754
|
+
# 构建 values
|
755
|
+
values_data = ', '.join(f'"{str(item)}"' for item in dict_data.values())
|
756
|
+
# 构建其他键值,重复时要更新的其他键
|
757
|
+
if main_key:
|
758
|
+
for col in main_key:
|
759
|
+
del dict_data[col]
|
760
|
+
if unique_main_key:
|
761
|
+
for col in unique_main_key:
|
762
|
+
del dict_data[col]
|
763
|
+
# 涉及列名务必使用反引号
|
764
|
+
update_datas = ', '.join([f'`{k}` = VALUES(`{k}`)' for k, v in dict_data.items()])
|
765
|
+
|
766
|
+
# 构建 sql
|
767
|
+
sql = f"INSERT INTO %s (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s" % (table_name, keys_data, values_data, update_datas)
|
768
|
+
# print(sql)
|
769
|
+
cursor.execute(sql)
|
770
|
+
connection.commit() # 提交数据库
|
771
|
+
connection.close()
|
772
|
+
|
773
|
+
def cover_dict_dtypes(self, dict_data):
    """Normalise the keys of ``dict_data`` and infer a MySQL column type per key.

    Keys are lower-cased, every special character listed in the pattern below is
    replaced by ``_``, runs of underscores are collapsed and trailing underscores
    are stripped. Values are converted to ``str``; an empty value becomes ``'0'``,
    a leading ``="`` / trailing ``"`` is removed and a plain percentage such as
    ``'12.5%'`` is converted to its ratio (``'0.125'``).

    NOTE: the original code passed ``re.IGNORECASE`` as the positional ``count``
    argument of ``re.sub`` (value 2), so only the first two special characters of
    a key were replaced. All occurrences are replaced now, which is what the
    follow-up "collapse underscores" steps expect.

    Args:
        dict_data: mapping of column name -> raw value.

    Returns:
        ``(types, cleaned)`` where ``types`` maps the cleaned key to a MySQL type
        string and ``cleaned`` maps the cleaned key to the cleaned string value.
        Returns ``None`` (after printing a message) when ``dict_data`` is empty.
    """
    if not dict_data:
        print(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
        return
    col_types = {}
    new_dict_data = {}
    for k, v in dict_data.items():
        # --- clean the key ---
        k = str(k).lower()
        k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k)
        k = k.replace(')', '')
        k = re.sub(r'_{2,}', '_', k)
        k = re.sub(r'_+$', '', k)

        # --- clean the value ---
        if str(v) == '':
            v = 0
        v = str(v)
        v = re.sub('^="|"$', '', v)
        if re.findall(r'^[-+]?\d+\.?\d*%$', v):
            v = str(float(v.rstrip("%")) / 100)

        # --- infer the column type; the first matching rule wins ---
        is_code_like = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
        is_ratio_like = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)

        date_type = is_valid_date(v)  # 1: date only, 2: date + time
        int_num = is_integer(v)
        count_int, count_float = count_decimal_places(v)  # digits before / after the point

        if is_code_like:  # identifiers / codes are stored as text
            col_type = 'varchar(100)'
        elif k == '日期':
            col_type = 'DATE'
        elif k == '更新时间':
            col_type = 'TIMESTAMP'
        elif is_ratio_like:
            col_type = 'decimal(10,4)'
        elif date_type == 1:
            col_type = 'DATE'
        elif date_type == 2:
            col_type = 'DATETIME'
        elif int_num:
            col_type = 'INT'
        elif count_float > 0:
            if count_int + count_float > 10:
                col_type = 'decimal(14,6)' if count_float >= 6 else 'decimal(14,4)'
            elif count_float >= 6:
                col_type = 'decimal(14,6)'
            elif count_float >= 4:
                col_type = 'decimal(12,4)'
            else:
                col_type = 'decimal(10,2)'
        else:
            col_type = 'varchar(255)'

        col_types[k] = col_type
        new_dict_data[k] = v
    return col_types, new_dict_data
|
835
|
+
|
836
|
+
def cover_df(self, df):
    """Clean the values and column names of ``df`` in place and return it.

    Steps:
      * ``inf`` / ``-inf`` and the placeholder strings ``\\N``, ``''``, ``nan``,
        ``NAN`` are replaced by ``'0'``; ``="`` and ``"`` are removed from values.
      * A column named ``id`` is dropped.
      * Values ending in ``%`` (and containing no CJK character) are converted to
        a ratio; values that cannot be parsed as a number are left untouched
        instead of aborting the whole clean-up.
      * Each column is converted to ``int64`` / ``float`` when *every* value
        converts cleanly; otherwise the column is left as it is.
      * Column names are lower-cased and special characters are replaced by ``_``.

    NOTE: the original code passed ``re.IGNORECASE`` as the positional ``count``
    argument of ``re.sub`` (value 2), so only the first two special characters of
    a column name were replaced. All occurrences are replaced now.

    Args:
        df: the DataFrame to clean (modified in place).

    Returns:
        The same DataFrame object, with remaining NaN values filled with 0.
    """
    df.replace([np.inf, -np.inf], '0', inplace=True)
    df.replace(to_replace=['\\N', '', 'nan', 'NAN'], value='0', regex=False, inplace=True)
    # ="  and  "  must be stripped separately because values such as id=86785565 exist
    df.replace(to_replace=['="'], value='', regex=True, inplace=True)
    df.replace(to_replace=['"'], value='', regex=True, inplace=True)

    def _percent_to_ratio(value):
        """Turn '12.5%' into 0.125; return anything else unchanged."""
        text = str(value)
        if text.endswith('%') and not re.findall('[\\u4e00-\\u9fa5]', text):
            try:
                return float(text[:-1]) / 100
            except ValueError:
                return value
        return value

    for col in df.columns.tolist():
        if col == 'id':
            df.pop('id')
            continue
        df[col] = df[col].apply(_percent_to_ratio)
        try:
            # np.int64 (not int) so very large numbers do not end up as uint64;
            # a single non-integer value aborts the conversion for the whole column
            df[col] = df[col].apply(
                lambda x: np.int64(str(x)) if '_' not in str(x) and '.' not in str(x) else x)
        except Exception:
            pass
        try:
            if df[col].dtype == 'object':
                df[col] = df[col].apply(lambda x: float(x) if '.' in str(x) and '_' not in str(x) else x)
        except Exception:
            pass
        new_col = col.lower()
        new_col = re.sub(r'[()\-,,&~^、 ()\"\'“”=·/。》《><!!`]', '_', new_col)
        new_col = new_col.replace(')', '')
        new_col = re.sub(r'_{2,}', '_', new_col)
        new_col = re.sub(r'_+$', '', new_col)
        df.rename(columns={col: new_col}, inplace=True)
    df.fillna(0, inplace=True)
    return df
|
870
|
+
|
871
|
+
def convert_df_dtypes(self, df: pd.DataFrame):
    """Clean ``df`` and map every column to a MySQL column type.

    The DataFrame is first passed through ``self.cover_df``. The type of each
    column is then chosen by the first matching rule: column-name patterns
    (codes/ids, ratios, year-on-year figures, amounts), the fixed names ``日期`` /
    ``更新时间``, and finally the pandas dtype.

    The original code also ran ``pd.to_numeric(..., errors='ignore')`` on every
    column but threw the result away, so it had no effect; that call has been
    removed (``errors='ignore'`` is deprecated in recent pandas releases).

    Args:
        df: DataFrame to clean; it is modified by ``cover_df``.

    Returns:
        ``(types, df)`` where ``types`` maps column name -> MySQL type string.
    """
    df = self.cover_df(df=df)
    col_types = {}
    for k, v in df.dtypes.to_dict().items():
        if re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE):  # ids / codes
            col_type = 'varchar(50)'
        elif re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE):  # ratios
            col_type = 'decimal(10,4)'
        elif re.findall(r'同比$|环比$', k, re.IGNORECASE):  # period-over-period figures
            col_type = 'decimal(12,4)'
        elif re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE):  # amounts
            col_type = 'decimal(12,2)'
        elif k == '日期':
            col_type = 'date'
        elif k == '更新时间':
            col_type = 'timestamp'
        elif v == 'int64':
            col_type = 'int'
        elif v == 'float64':
            col_type = 'decimal(10,4)'
        elif v == 'bool':
            col_type = 'boolean'
        elif v == 'datetime64[ns]':
            col_type = 'datetime'
        else:
            col_type = 'varchar(255)'
        col_types[k] = col_type
    return col_types, df
|
906
|
+
|
907
|
+
@try_except
def df_to_mysql(self, df, db_name, table_name, set_typ=None, icm_update=[], move_insert=False, df_sql=False, drop_duplicates=False,
                filename=None, count=None, reset_id=False, allow_not_null=False, cut_data=None):
    """Upload a DataFrame into ``db_name``.``table_name``.

    The database, the table and any missing columns are created on the fly.
    Exactly one upload strategy is used, checked in this order: ``df_sql``,
    ``move_insert``, then row-by-row (``drop_duplicates`` / ``icm_update`` /
    plain insert).

    Args:
        df: data to upload; must be a non-empty ``pandas.DataFrame``.
        db_name: target database name.
        table_name: target table name.
        set_typ: optional ``{column: mysql_type}`` overriding the inferred types
            (only keys that are real columns of ``df`` are applied).
        icm_update: incremental update; list of columns that uniquely identify a
            row. Rows found by these columns are updated where values differ,
            others are inserted. Cannot be combined with ``move_insert``,
            ``df_sql`` or ``drop_duplicates``.
        move_insert: delete the rows covering the date range of ``df['日期']``
            first, then bulk-insert. Intended for aggregated data only.
        df_sql: bulk-insert the whole frame with ``DataFrame.to_sql`` without any
            duplicate check.
        drop_duplicates: insert a row only when an identical row is not already
            stored (slow on large data).
        filename: only used in log messages to locate the offending file.
        count: only used in log messages (progress indicator).
        reset_id: rebuild the auto-increment ``id`` column after uploading.
        allow_not_null: when true, new columns are created nullable; otherwise
            they are created ``NOT NULL``.
        cut_data: ``'year'`` or ``'month'``; appends the earliest year (and
            month) found in ``df['日期']`` to ``table_name``.

    Returns:
        None. Problems are reported with ``print``.
    """
    if not self.config:
        return
    if icm_update:
        if move_insert or df_sql or drop_duplicates:
            print(f'icm_update/move_insert/df_sql/drop_duplicates 参数不能同时设定')
            return
    if move_insert:
        if icm_update or df_sql or drop_duplicates:
            print(f'icm_update/move_insert/df_sql/drop_duplicates 参数不能同时设定')
            return

    self.filename = filename
    if isinstance(df, pd.DataFrame):
        if len(df) == 0:
            print(f'{db_name}: {table_name} 传入的 df 数据长度为0, {self.filename}')
            return
    else:
        print(f'{db_name}: {table_name} 传入的 df 不是有效的 dataframe 结构, {self.filename}')
        return
    if not db_name or db_name == 'None':
        print(f'{db_name} 不能为 None')
        return

    if cut_data:
        if '日期' in df.columns.tolist():
            try:
                # errors='ignore' was dropped: a failed parse ends in the except
                # branch below either way, and the option is deprecated in pandas
                df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d')
                earliest = df['日期'].min(skipna=True)
                min_year = earliest.year
                min_month = f'{int(earliest.month):02d}'
                if str(cut_data).lower() == 'year':
                    table_name = f'{table_name}_{min_year}'
                elif str(cut_data).lower() == 'month':
                    table_name = f'{table_name}_{min_year}-{min_month}'
                else:
                    print(f'参数不正确,cut_data应为 year 或 month ')
            except Exception as e:
                print(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')

    # clean illegal values and infer the column types
    dtypes, df = self.convert_df_dtypes(df)
    if set_typ:
        # only override types for columns that really exist in df
        for col, typ in set_typ.items():
            if col in dtypes:
                dtypes[col] = typ

    # ---- 1. make sure the database exists ----
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES LIKE %s", (db_name,))
            database_exists = cursor.fetchone()
            if not database_exists:
                # NOTE(review): statement order kept from the original. The two
                # `if` statements are treated as siblings (the only layout in
                # which `sql` is always bound), which means the utf8mb4_unicode_ci
                # statement is always overwritten below - confirm the intent.
                if '8.138.27' in str(self.host) or platform.system() == "Linux":
                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
                if '192.168.1.100' in str(self.host):
                    sql = f"CREATE DATABASE `{db_name}`"
                else:
                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
                cursor.execute(sql)
                connection.commit()
                print(f"创建Database: {db_name}")
    finally:
        # the original never closed this first connection before reconnecting
        connection.close()

    # ---- 2. reconnect with the database selected ----
    self.config.update({'database': db_name})
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            # create an empty table when it does not exist yet
            cursor.execute("SHOW TABLES LIKE %s;", (table_name,))
            if not cursor.fetchone():
                sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
                cursor.execute(sql)
                print(f'创建 mysql 表: {table_name}')

            sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
            cursor.execute(sql, (db_name, table_name))
            col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
            col_not_exist = [col for col in df.columns.tolist() if col not in col_exist]

            # add the columns missing from the table
            for col in col_not_exist:
                if allow_not_null:
                    sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]};"
                else:
                    sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
                cursor.execute(sql)
                print(f"添加列: {col}({dtypes[col]})")

                # the date column gets an index
                if col == '日期':
                    sql = f"SHOW INDEXES FROM `{table_name}` WHERE `Column_name` = %s"
                    cursor.execute(sql, (col,))
                    if not cursor.fetchone():
                        cursor.execute(f"CREATE INDEX index_name ON `{table_name}`(`{col}`)")
            connection.commit()

            # ---- strategy A: bulk upload through DataFrame.to_sql ----
            if df_sql:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(
                    f'{now} 正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
                engine = create_engine(
                    f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}")
                df.to_sql(
                    name=table_name,
                    con=engine,
                    if_exists='append',
                    index=False,
                    chunksize=1000
                )
                if reset_id:
                    self._reset_auto_increment_id(cursor, connection, db_name, table_name)
                connection.commit()
                return

            # ---- strategy B: delete the covered date range, then bulk insert ----
            if move_insert and '日期' in df.columns.tolist():
                dates = [pd.to_datetime(item) for item in df['日期'].values.tolist()]
                start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
                end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

                # half-open range [start, max + 1 day): covers the whole last day
                # without also deleting rows dated on the following day, which
                # the original inclusive BETWEEN did
                sql = f"DELETE FROM `{table_name}` WHERE `日期` >= %s AND `日期` < %s"
                cursor.execute(sql, (start_date, end_date))
                connection.commit()

                engine = create_engine(
                    f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}")
                df.to_sql(
                    name=table_name,
                    con=engine,
                    if_exists='append',
                    index=False,
                    chunksize=1000
                )
                # this strategy always rebuilds the id column (as the original did)
                self._reset_auto_increment_id(cursor, connection, db_name, table_name)
                return

            # ---- strategy C: row by row ----
            # NOTE(review): values are embedded in the SQL text between double
            # quotes, so a value containing '"' or a backslash breaks the
            # statement (and is a SQL-injection risk for untrusted input).
            # Switching to bound parameters needs numpy / Timestamp values to be
            # converted to native Python types first.
            failed_rows = 0
            last_error = None
            for data in df.to_dict(orient='records'):
                # `data` is the incoming row, e.g. {'日期': Timestamp(...), '推广费余额': 33299, ...}
                try:
                    cols = ', '.join(f"`{item}`" for item in data.keys())
                    values = ', '.join([f'"{item}"' for item in data.values()])
                    condition = ' AND '.join(f'`{k}` = "{v}"' for k, v in data.items())
                    insert_sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({values});"

                    if drop_duplicates:
                        # insert only when no identical row is stored
                        cursor.execute(f"SELECT {cols} FROM `{table_name}` WHERE {condition}")
                        if not cursor.fetchall():
                            cursor.execute(insert_sql)
                    elif icm_update:
                        # The columns in icm_update must uniquely identify a row
                        # and never change, otherwise rows may be overwritten.
                        sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
                        cursor.execute(sql, (db_name, table_name))
                        cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()]
                        update_col = [item for item in cols_exist if
                                      item not in icm_update and item != 'id']

                        select_cols = ', '.join(f"`{item}`" for item in update_col)
                        key_condition = ' AND '.join(f'`{up_col}` = "{data[up_col]}"' for up_col in icm_update)
                        cursor.execute(f"SELECT {select_cols} FROM `{table_name}` WHERE {key_condition}")
                        results = cursor.fetchall()  # rows currently stored
                        if results:
                            for result in results:
                                change_col = []
                                change_values = []
                                for col in update_col:
                                    # MySQL decimals keep trailing zeros (5.00)
                                    # while pandas prints 5.0: strip before comparing
                                    df_value = str(data[col])
                                    mysql_value = str(result[col])
                                    if '.' in df_value:
                                        df_value = re.sub(r'0+$', '', df_value)
                                        df_value = re.sub(r'\.$', '', df_value)
                                    if '.' in mysql_value:
                                        mysql_value = re.sub(r'0+$', '', mysql_value)
                                        mysql_value = re.sub(r'\.$', '', mysql_value)
                                    if df_value != mysql_value:
                                        change_values += [f"`{col}` = \"{str(data[col])}\""]
                                        change_col.append(col)
                                if not change_values:
                                    continue
                                # narrow the UPDATE to this stored row by adding the
                                # unchanged columns; built per row so the condition
                                # no longer accumulates across rows
                                where = key_condition
                                not_change_col = [item for item in update_col if item not in change_col]
                                if not_change_col:
                                    unchanged = ' AND '.join(
                                        f'`{col}` = "{str(data[col])}"' for col in not_change_col)
                                    where += f' AND {unchanged}'
                                set_clause = ', '.join(change_values)
                                cursor.execute(f"UPDATE `{table_name}` SET {set_clause} WHERE {where}")
                        else:
                            cursor.execute(insert_sql)
                    else:
                        cursor.execute(insert_sql)
                except Exception as e:
                    # best effort: a failing row must not stop the upload, but it
                    # is counted and reported once below instead of being lost
                    failed_rows += 1
                    last_error = e
            if failed_rows:
                print(f'{db_name}/{table_name}: {failed_rows} 行数据写入失败, 最后一次错误: {last_error}, {self.filename}')

            if reset_id:
                self._reset_auto_increment_id(cursor, connection, db_name, table_name)
            connection.commit()
    finally:
        connection.close()

def _reset_auto_increment_id(self, cursor, connection, db_name, table_name):
    """Drop and re-create the auto-increment ``id`` column so numbering restarts at 1.

    Nothing is changed when the table has a composite primary key (dropping
    ``id`` could leave a non-unique key). Any error is printed and the
    connection is rolled back; it is never raised.
    """
    try:
        sql = (
            "SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
            "WHERE `TABLE_SCHEMA` = %s AND `TABLE_NAME` = %s AND `COLUMN_KEY` = 'PRI';"
        )
        cursor.execute(sql, (db_name, table_name))
        primary_keys = cursor.fetchall()
        if len(primary_keys) > 1:
            key_names = [item['PrimaryKey'] for item in primary_keys]
            print(f'{table_name} 存在复合主键: {key_names}, 无法重置自增id')
            return
        column_name = 'id'
        sql = ('SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS '
               'WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s')
        cursor.execute(sql, (db_name, table_name, column_name))
        if cursor.fetchone():
            cursor.execute(f"ALTER TABLE `{table_name}` DROP COLUMN {column_name}")
            cursor.execute(
                f"ALTER TABLE `{table_name}` ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
            cursor.execute(f"ALTER TABLE `{table_name}` AUTO_INCREMENT = 1")
    except Exception as e:
        print(f'333 {table_name} {e}')
        connection.rollback()
|
1257
|
+
|
1258
|
+
@try_except
def read_doc_data(self, table_name, db_name='pdf文件', column='文件名', filename=None, save_path='/Users/xigua/Downloads'):
    """Read a stored binary document from MySQL and write it to a local file.

    Rows of ``table_name`` whose ``column`` equals ``filename`` are selected and
    the content of their ``数据主体`` column is written to
    ``save_path/filename`` (every matching row is written to the same path, so
    the last one wins).

    Args:
        table_name: table holding the documents.
        db_name: database holding the table.
        column: column that stores the file name.
        filename: file name to look up; also the name of the local file.
        save_path: directory the file is written to.

    Returns:
        None. Missing database / table and a missing ``filename`` are reported
        with ``print``.
    """
    if not filename:
        print(f'未指定文件名: filename')
        return

    # ---- check that the database exists ----
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES LIKE %s", (db_name,))
            database_exists = cursor.fetchone()
    finally:
        # close before reconnecting; the original leaked this connection
        connection.close()
    if not database_exists:
        print(f"Database {db_name} 数据库不存在")
        return

    # ---- reconnect with the database selected and read the document ----
    self.config.update({'database': db_name})
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW TABLES LIKE %s;", (table_name,))
            if not cursor.fetchone():
                print(f'{table_name} -> 数据表不存在')
                return

            # the file name is bound as a parameter so quotes in it cannot break the query
            sql = f"SELECT `{column}`, `数据主体` FROM `{table_name}` WHERE `{column}` = %s"
            cursor.execute(sql, (filename,))
            target = os.path.join(save_path, filename)
            for result in cursor.fetchall():
                with open(target, 'wb') as f:
                    f.write(result['数据主体'])
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(f'{now} 写入本地文件: ({self.host}:{self.port}) {db_name}/{table_name} -> {target}')
    finally:
        connection.close()
|
1307
|
+
|
1308
|
+
def read_mysql(self, table_name, start_date, end_date, db_name='远程数据源', date_name='日期'):
    """Read the rows of a table that fall inside a date range.

    Args:
        table_name: table to read.
        start_date: first date of the range (anything ``pd.to_datetime`` accepts).
        end_date: last date of the range, inclusive (SQL ``BETWEEN``).
        db_name: database holding the table.
        date_name: name of the date column used for filtering.

    Returns:
        A ``pandas.DataFrame`` with the selected rows. An empty DataFrame is
        returned when the connection cannot be established, the database does
        not exist or the query fails (the original returned ``None`` for a
        failed connection, unlike every other failure path).
    """
    start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d')
    end_date = pd.to_datetime(end_date).strftime('%Y-%m-%d')
    df = pd.DataFrame()

    # ---- check that the database exists ----
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return df
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES LIKE %s", (db_name,))
            if not cursor.fetchone():
                print(f"Database {db_name} 数据库不存在")
                return df
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f'{now} mysql 正在查询表: {table_name}, 范围: {start_date}~{end_date}')
    except Exception as e:
        print(f'{e} {db_name} -> 检查数据库时出错')
        return df
    finally:
        connection.close()

    before_time = time.time()
    # ---- reconnect with the database selected and read the data ----
    self.config.update({'database': db_name})
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return df
    try:
        with connection.cursor() as cursor:
            sql = f"SELECT * FROM `{db_name}`.`{table_name}` WHERE `{date_name}` BETWEEN %s AND %s"
            cursor.execute(sql, (start_date, end_date))
            rows = cursor.fetchall()
            columns = [desc[0] for desc in cursor.description]
            df = pd.DataFrame(rows, columns=columns)
    except Exception as e:
        print(f'{e} {db_name} -> {table_name} 表不存在')
        return df
    finally:
        connection.close()

    if len(df) == 0:
        print(f'database: {db_name}, table: {table_name} 查询的数据为空')
    else:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        cost_time = int(time.time() - before_time)
        if cost_time < 1:
            # show sub-second durations with two decimals instead of 0
            cost_time = round(time.time() - before_time, 2)
        print(f'{now} mysql ({self.host}) 表: {table_name} 获取数据长度: {len(df)}, 用时: {cost_time} 秒')
    return df
|
1363
|
+
|
1364
|
+
def upload_pandas(self, update_path, db_name, days=None):
    """Upload every CSV file found under ``update_path`` to MySQL.

    Each entry of ``update_path`` is handled as follows:
      * a directory: every ``.csv`` below it is uploaded to a table named after
        the directory;
      * a ``.csv`` file: it is uploaded to a table named ``<file stem>_f``.
    Entries whose name contains ``其他数据``, ``年.csv`` or ``京东数据集`` and CSV
    files whose name contains ``baidu`` are skipped.

    When a file has a ``日期`` column, only rows dated on or after the start date
    are uploaded. The original single-file branch tested
    ``'日期' not in df.columns`` by mistake, which raised ``KeyError`` for files
    without the column and skipped the filter for files that had it.

    Args:
        update_path: directory containing the data files.
        db_name: target database name.
        days: only upload rows from the last ``days`` days; when falsy, rows
            from 2000-01-01 onwards are uploaded.
    """
    if days:
        start_date = pd.to_datetime(datetime.date.today() - datetime.timedelta(days=days))
    else:
        start_date = pd.to_datetime('2000-01-01')

    def _upload_csv(csv_path, table):
        """Read one CSV, apply the date filter and upload the remaining rows."""
        df = pd.read_csv(csv_path, encoding='utf-8_sig', header=0, na_filter=False)
        if '日期' in df.columns.tolist():
            df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else x)
            df = df[df['日期'] >= start_date]
        if len(df) == 0:
            return
        self.df_to_mysql(df=df, db_name=db_name, table_name=table)

    for root_file in os.listdir(update_path):
        if '其他数据' in root_file or '年.csv' in root_file or '京东数据集' in root_file:
            continue  # skipped entries
        f_path = os.path.join(update_path, root_file)

        if os.path.isdir(f_path):
            for root, dirs, files in os.walk(f_path, topdown=False):
                for name in files:
                    if name.endswith('.csv') and 'baidu' not in name:
                        _upload_csv(os.path.join(root, name), root_file)
        elif os.path.isfile(f_path):
            if f_path.endswith('.csv') and 'baidu' not in f_path:
                # tables created from a single file get the _f suffix
                _upload_csv(f_path, f'{os.path.splitext(root_file)[0]}_f')
|
1412
|
+
|
1413
|
+
|
1414
|
+
class OptimizeDatas:
|
1415
|
+
"""
|
1416
|
+
数据维护 删除 mysql 的冗余数据
|
1417
|
+
更新过程:
|
1418
|
+
1. 读取所有数据表
|
1419
|
+
2. 遍历表, 遍历列, 如果存在日期列则按天遍历所有日期, 不存在则全表读取
|
1420
|
+
3. 按天删除所有冗余数据(存在日期列时)
|
1421
|
+
tips: 查找冗余数据的方式是创建一个临时迭代器, 逐行读取数据并添加到迭代器, 出现重复时将重复数据的 id 添加到临时列表, 按列表 id 执行删除
|
1422
|
+
"""
|
1423
|
+
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
    """Store the connection settings and reset the de-duplication state.

    Args:
        username: MySQL user name.
        password: MySQL password.
        host: MySQL host.
        port: MySQL port (converted with ``int`` for the connection config).
        charset: connection charset; utf8mb4 supports four-byte UTF-8 characters.
    """
    # connection settings
    self.username, self.password = username, password
    self.host, self.port = host, port
    self.charset = charset
    self.config = dict(
        host=self.host,
        port=int(self.port),
        user=self.username,
        password=self.password,
        charset=self.charset,
        cursorclass=pymysql.cursors.DictCursor,
    )

    # what to clean: several databases (optimize_list) or a single one (optimize)
    self.db_name_lists: list = []
    self.db_name = None

    # date window: de-duplicate the most recent `days` days
    self.days: int = 63
    self.end_date = None
    self.start_date = None

    self.connection = None
|
1443
|
+
|
1444
|
+
@staticmethod
|
1445
|
+
def try_except(func): # 在类内部定义一个异常处理方法
|
1446
|
+
|
1447
|
+
@wraps(func)
|
1448
|
+
def wrapper(*args, **kwargs):
|
1449
|
+
try:
|
1450
|
+
return func(*args, **kwargs)
|
1451
|
+
except Exception as e:
|
1452
|
+
print(f'{func.__name__}, {e}') # 将异常信息返回
|
1453
|
+
with open(error_file, 'a') as f:
|
1454
|
+
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
1455
|
+
f.write(f'\n{now} \n')
|
1456
|
+
f.write(f'函数注释内容(用于定位函数): {func.__doc__} \n')
|
1457
|
+
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
1458
|
+
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
1459
|
+
print(f'更多信息请查看日志文件: {error_file}')
|
1460
|
+
|
1461
|
+
return wrapper
|
1462
|
+
|
1463
|
+
def keep_connect(self, _db_name, _config, max_try: int=10):
    """Open a MySQL connection, retrying on failure.

    Args:
        _db_name: database name, only used in the final failure message.
        _config: keyword arguments passed to ``pymysql.connect``.
        max_try: maximum number of connection attempts.

    Returns:
        The open connection, or ``None`` when every attempt failed.
    """
    for attempt in range(1, max_try + 1):
        try:
            return pymysql.connect(**_config)
        except Exception as e:
            print(f'连接失败,正在重试: {attempt}/{max_try} {e}')
            # wait before the next attempt, but not after the last one
            # (the original slept another 30 s before giving up)
            if attempt < max_try:
                time.sleep(30)
    print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
    return None
|
1475
|
+
|
1476
|
+
def optimize_list(self):
    """
    Remove redundant rows from several databases, one after another.

    self.db_name_lists must be set beforehand; each entry is assigned to
    self.db_name and then processed by optimize().
    """
    if self.db_name_lists:
        for name in self.db_name_lists:
            self.db_name = name
            self.optimize()
        return
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{stamp} 尚未设置参数: self.db_name_lists')
    return
|
1488
|
+
|
1489
|
+
def optimize(self, except_key=None):
    """
    Remove redundant (duplicate) rows from every table of self.db_name, then rebuild
    the auto-increment ``id`` column of each table.

    Tables with a `日期` column of type date/datetime are de-duplicated day by day over
    the window [self.start_date, self.end_date] (clipped to the dates present in the
    table); all other tables are de-duplicated as a whole.

    :param except_key: column names ignored when rows are compared; defaults to ['更新时间']
    :return: None
    """
    if except_key is None:  # avoid a shared mutable default argument
        except_key = ['更新时间']
    if not self.db_name:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f'{now} 尚未设置参数: self.db_name')
        return
    tables = self.table_list(db_name=self.db_name)
    if not tables:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f'{now} {self.db_name} -> 数据表不存在')
        return

    # Initialise the date window.
    if not self.end_date:
        self.end_date = pd.to_datetime(datetime.datetime.today())
    else:
        self.end_date = pd.to_datetime(self.end_date)
    if self.days:
        self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
    if not self.start_date:
        self.start_date = self.end_date
    else:
        self.start_date = pd.to_datetime(self.start_date)
    start_date_before = self.start_date
    end_date_before = self.end_date

    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{now} mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
    for table_dict in tables:
        for table_name in table_dict.values():
            self.config.update({'database': self.db_name})  # add/refresh the 'database' entry
            self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
            if not self.connection:
                return
            # try/finally: the connection must also be closed when an empty table is
            # skipped with `continue` or when a statement raises.
            try:
                with self.connection.cursor() as cursor:
                    cursor.execute(f"SELECT 1 FROM `{table_name}` LIMIT 1")
                    if not cursor.fetchone():  # skip empty tables
                        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        print(f'{now} 数据表: {table_name}, 数据长度为 0')
                        continue

                    cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")  # column metadata
                    columns = cursor.fetchall()
                    # Is there a `日期` column whose type is date or datetime?
                    date_exist = any(
                        col['Field'] == '日期' and (col['Type'] == 'date' or col['Type'].startswith('datetime'))
                        for col in columns
                    )
                    if date_exist:
                        cursor.execute(f"SELECT MAX(日期) AS max_date FROM `{table_name}`")
                        max_date = pd.to_datetime(cursor.fetchone()['max_date'])
                        cursor.execute(f"SELECT MIN(日期) AS min_date FROM `{table_name}`")
                        min_date = pd.to_datetime(cursor.fetchone()['min_date'])
                        # Clip the window to the dates present in the table; a NULL
                        # MIN/MAX (every date NULL) leaves the window untouched.
                        if pd.notna(min_date) and self.start_date < min_date:
                            self.start_date = min_date
                        if pd.notna(max_date) and self.end_date > max_date:
                            self.end_date = max_date
                        dates_list = self.day_list(start_date=self.start_date, end_date=self.end_date)
                        for date in dates_list:
                            self.delete_duplicate(table_name=table_name, date=date, except_key=except_key)
                        # Restore the window, otherwise the next table starts from clipped dates.
                        self.start_date = start_date_before
                        self.end_date = end_date_before
                    else:  # no date column: compare all rows of the table
                        self.delete_duplicate2(table_name=table_name, except_key=except_key)

                    # Rebuild the auto-increment id column.
                    try:
                        # List the primary-key columns (parameterized instead of string-built).
                        sql = (
                            "SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
                            "WHERE `TABLE_SCHEMA` = %s AND `TABLE_NAME` = %s AND `COLUMN_KEY` = 'PRI'"
                        )
                        cursor.execute(sql, (self.db_name, table_name))
                        result = cursor.fetchall()
                        # With a composite primary key the id column cannot simply be dropped:
                        # the remaining key columns may not be unique and the ALTER would fail.
                        if len(result) <= 1:
                            column_name = 'id'
                            sql = ('SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS '
                                   'WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s')
                            cursor.execute(sql, (self.db_name, table_name, column_name))
                            if cursor.fetchone():
                                cursor.execute(f"ALTER TABLE `{table_name}` DROP COLUMN `{column_name}`")
                            cursor.execute(
                                f"ALTER TABLE `{table_name}` ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
                            cursor.execute(f"ALTER TABLE `{table_name}` AUTO_INCREMENT = 1")  # restart at 1
                        else:
                            # Double quotes outside, single quotes inside: nesting the same
                            # quote type in an f-string is a SyntaxError before Python 3.12.
                            print(f"{table_name} 存在复合主键: {[item['PrimaryKey'] for item in result]}, 无法重置自增id")
                    except Exception as e:
                        print(f'333 {table_name} {e}')
                        self.connection.rollback()
            finally:
                self.connection.close()
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{now} mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
|
1597
|
+
|
1598
|
+
def delete_duplicate(self, table_name, date, except_key=None):
    """
    Delete the duplicated rows of one day from ``table_name``.

    Rows are fetched with table_datas() and compared on every column except ``id`` and
    the columns in ``except_key``; the first occurrence is kept and the ids of later
    occurrences are deleted through self.connection.

    :param table_name: table to clean
    :param date: day to process; must provide strftime() (used in the progress message)
    :param except_key: column names ignored in the comparison; defaults to ['更新时间']
    :return: None
    """
    if except_key is None:  # avoid a shared mutable default argument
        except_key = ['更新时间']
    datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
    if not datas:
        return
    ignored = set(except_key) | {'id'}
    duplicate_id = []  # ids of the rows that repeat an earlier row
    seen = set()  # fingerprints already met; a set keeps the membership test O(1)
    for data in datas:
        try:
            row_id = data['id']
        except KeyError as e:
            print(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate -> {e}')
            continue
        fingerprint = str({k: v for k, v in data.items() if k not in ignored})
        # Normalise string values by dropping a trailing ".0…" so '1.00' and '1' compare equal.
        fingerprint = re.sub(r'\.0+\', ', '\', ', fingerprint)
        if fingerprint in seen:
            duplicate_id.append(row_id)
            continue
        seen.add(fingerprint)

    if not duplicate_id:  # nothing to delete for this day
        return

    try:
        with self.connection.cursor() as cursor:
            placeholders = ', '.join(['%s'] * len(duplicate_id))
            sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
            cursor.execute(sql, duplicate_id)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f"{now} {table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
        self.connection.commit()
    except Exception as e:
        print(f'{self.db_name}/{table_name}, {e}')
        self.connection.rollback()  # undo on failure
|
1635
|
+
|
1636
|
+
def delete_duplicate2(self, table_name, except_key=None):
    """
    Delete the duplicated rows of a table that has no date column.

    Every row of ``table_name`` is read through self.connection and compared on all
    columns except ``id`` and the columns in ``except_key``; the first occurrence is
    kept and the ids of later occurrences are deleted.

    :param table_name: table to clean
    :param except_key: column names ignored in the comparison; defaults to ['更新时间']
    :return: None
    """
    if except_key is None:  # avoid a shared mutable default argument
        except_key = ['更新时间']
    with self.connection.cursor() as cursor:
        cursor.execute(f"SELECT * FROM `{table_name}`")  # no date column: read everything
        datas = cursor.fetchall()
    if not datas:
        return
    ignored = set(except_key) | {'id'}
    duplicate_id = []  # ids of the rows that repeat an earlier row
    seen = set()  # fingerprints already met; a set keeps the membership test O(1)
    for data in datas:
        try:
            row_id = data['id']
        except KeyError as e:
            # Same best-effort handling as delete_duplicate: a row without id cannot be
            # deleted by id, so report it and go on instead of aborting.
            print(f'{table_name} 函数: mysql - > OptimizeDatas -> delete_duplicate2 -> {e}')
            continue
        fingerprint = str({k: v for k, v in data.items() if k not in ignored})
        # Normalise string values by dropping a trailing ".0…" so '1.00' and '1' compare equal.
        fingerprint = re.sub(r'\.0+\', ', '\', ', fingerprint)
        if fingerprint in seen:
            duplicate_id.append(row_id)
            continue
        seen.add(fingerprint)

    if not duplicate_id:  # nothing to delete in this table
        return

    try:
        with self.connection.cursor() as cursor:
            placeholders = ', '.join(['%s'] * len(duplicate_id))
            sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
            cursor.execute(sql, duplicate_id)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f"{now} {table_name} -> before: {len(datas)}, "
                  f"remove: {cursor.rowcount}")
        self.connection.commit()
    except Exception as e:
        print(f'{self.db_name}/{table_name}, {e}')
        self.connection.rollback()  # undo on failure
|
1674
|
+
|
1675
|
+
def database_list(self):
    """
    Return every database of the server.

    :return: the rows of ``SHOW DATABASES``, or None when no connection could be opened
    """
    connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES")
            return cursor.fetchall()
    finally:
        connection.close()  # also released when the query raises
|
1686
|
+
|
1687
|
+
def table_list(self, db_name):
    """
    Return every table of the given database.

    A first connection checks that the database exists; self.config['database'] is then
    set to ``db_name`` and a second connection runs ``SHOW TABLES``.

    :param db_name: database to inspect
    :return: the rows of ``SHOW TABLES``, or None when the database does not exist,
             the existence check fails, or no connection could be opened
    """
    connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            # Parameterized so that the name is escaped by the driver.
            cursor.execute("SHOW DATABASES LIKE %s", (db_name,))  # does the database exist?
            database_exists = cursor.fetchone()
            if not database_exists:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(f'{now} {db_name}: 数据表不存在!')
                return
    except Exception as e:
        print(f'002 {e}')
        return
    finally:
        connection.close()

    self.config.update({'database': db_name})  # add/refresh the 'database' entry
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW TABLES")
            return cursor.fetchall()
    finally:
        connection.close()  # also released when the query raises
|
1717
|
+
|
1718
|
+
def table_datas(self, db_name, table_name, date):
    """
    Fetch the rows of one day from a table.

    :param db_name: database holding the table; also stored in self.config['database']
    :param table_name: table to read
    :param date: day to select on the `日期` column (its str() form is sent to MySQL)
    :return: the fetched rows, or None when the connection or the query fails
    """
    self.config.update({'database': db_name})  # add/refresh the 'database' entry
    connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=10)
    if not connection:
        return
    # Pre-set so a failing query returns None instead of raising UnboundLocalError below.
    results = None
    try:
        with connection.cursor() as cursor:
            # The date is passed as a parameter so the driver escapes it.
            sql = f"SELECT * FROM `{table_name}` WHERE `日期` BETWEEN %s AND %s"
            cursor.execute(sql, (str(date), str(date)))
            results = cursor.fetchall()
    except Exception as e:
        print(f'001 {e}')
    finally:
        connection.close()
    return results
|
1737
|
+
|
1738
|
+
def day_list(self, start_date, end_date):
    """
    Build the list of days from start_date to end_date.

    Both bounds are parsed with pd.to_datetime. The cursor advances one day at a time
    from start_date (keeping its time of day) while it is <= end_date; each step
    contributes the midnight timestamp of that day.

    :return: list of pandas Timestamps, empty when start_date is after end_date
    """
    cursor_day = pd.to_datetime(start_date)
    last_day = pd.to_datetime(end_date)
    one_day = datetime.timedelta(days=1)
    days = []
    while cursor_day <= last_day:
        midnight = pd.to_datetime(cursor_day.date())
        days.append(midnight)
        cursor_day = cursor_day + one_day
    return days
|
1746
|
+
|
1747
|
+
def rename_column(self):
    """
    Strip trailing underscores from the column names of every table in self.db_name.

    Usage, from the original notes::

        s = OptimizeDatas(username=username, password=password, host=host, port=port)
        s.db_name = db_name
        s.rename_column()

    :return: None
    """
    tables = self.table_list(db_name=self.db_name)
    if not tables:  # table_list() returns None on failure; nothing to rename
        return
    for table_dict in tables:
        for table_name in table_dict.values():
            self.config.update({'database': self.db_name})  # add/refresh the 'database' entry
            self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=10)
            if not self.connection:
                return
            # One connection is opened per table, so each one is closed here rather than
            # only the last one after the loop.
            try:
                with self.connection.cursor() as cursor:
                    cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")  # column metadata
                    for column in cursor.fetchall():
                        old_name, column_type = column['Field'], column['Type']
                        if not old_name.endswith('_'):
                            continue
                        new_name = old_name.rstrip('_')
                        if not new_name:  # a name made only of underscores cannot be shortened
                            continue
                        # NOTE(review): CHANGE COLUMN is given the type only; confirm whether
                        # other column attributes (NULL-ability, default, comment) must be kept.
                        sql = f"ALTER TABLE `{table_name}` CHANGE COLUMN `{old_name}` `{new_name}` {column_type}"
                        cursor.execute(sql)
                self.connection.commit()
            finally:
                self.connection.close()
|
1776
|
+
|
1777
|
+
|
1778
|
+
def year_month_day_bak(start_date, end_date):
    """
    List every month between start_date and end_date with its first and last day.

    start_date is moved to the first day of its month, so a date inside the current
    month still yields that month. calendar.monthrange supplies the number of days
    of each month.

    :return: list of {'起始日期': 'YYYY-MM-01', '结束日期': 'YYYY-MM-DD'} dicts in
             chronological order; [] when start_date cannot be parsed
    """
    try:
        parsed = pd.to_datetime(start_date)
        month_start = f'{parsed.year}-{parsed.month}-01'
    except Exception as e:
        print(e)
        return []

    # freq='MS' yields the first day of each month in the range.
    month_index = pd.date_range(start=month_start, end=end_date, freq='MS')
    labels = month_index.strftime('%Y-%m').drop_duplicates().sort_values()

    def _span(label):
        # label looks like 'YYYY-MM'
        year_text, month_text = label.split('-')
        last_day = calendar.monthrange(int(year_text), int(month_text))[1]
        return {'起始日期': f'{label}-01', '结束日期': f'{label}-{last_day}'}

    return [_span(label) for label in labels]
|
1802
|
+
|
1803
|
+
|
1804
|
+
if __name__ == '__main__':
    # Manual smoke check: load the local MySQL settings from the project configuration.
    conf = myconfig.main()
    data = conf['Windows']['xigua_lx']['mysql']['local']
    username, password, host, port = data['username'], data['password'], data['host'], data['port']
    # The password is masked: credentials must not be echoed to the console or captured logs.
    print(username, '******', host, port)
|