mdbq 3.2.11__py3-none-any.whl → 3.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +90 -6
- mdbq/aggregation/query_data.py +5 -4
- mdbq/mysql/mysql.py +267 -32
- {mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/METADATA +1 -1
- {mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/RECORD +7 -7
- {mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/WHEEL +0 -0
- {mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -1324,7 +1324,91 @@ def cut_as_year_month(as_month=False):
             df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)
 
 
+def doc_to_sql(write_data=False, read_data=False):
+    if not write_data and not read_data:
+        return
+    # filename = '关于做好2024年世界互联网大会乌镇峰会期间寄递渠道安全保障工作的通知.pdf'
+    path = '/Users/xigua/数据中心/微信pdf文件/2024-10'
+
+    if not os.path.isdir(path):
+        print(f'不存在的文件夹: {path}')
+        return
+    m_engine = mysql.MysqlUpload(
+        username=username,
+        password=password,
+        host=host,
+        port=port,
+        charset='utf8mb4'
+    )
+    if write_data:
+        for root, dirs, files in os.walk(path, topdown=False):
+            for name in files:
+                if '~$' in name or '.DS' in name or '.localized' in name or 'baidu' in name:
+                    continue
+                if name.endswith('.pdf') or name.endswith('.pptx'):
+                    file_size = os.stat(os.path.join(root, name)).st_size
+                    if file_size > 1024 * 1024 * 1024:
+                        file_size = file_size / 1024 / 1024 / 1024
+                        file_size = f'{file_size:.2f} GB'
+                    elif file_size > 1024 * 1024:
+                        file_size = file_size / 1024 / 1024
+                        file_size = f'{file_size:.2f} MB'
+                    else:
+                        file_size = file_size / 1024
+                        file_size = f'{file_size:.2f} KB'
+                    mod_time = os.path.getmtime(os.path.join(root, name))
+                    local_time = time.localtime(mod_time)
+                    mod_time_formatted = time.strftime('%Y-%m-%d %H:%M:%S', local_time)
+
+                    # 读取PDF文件为二进制数据
+                    with open(os.path.join(path, name), 'rb') as file:
+                        pdf_data = file.read()
+                    dict_data = {
+                        '日期': datetime.datetime.today().strftime('%Y-%m-%d'),
+                        '数据来源': '微信',
+                        '文件名称': name,
+                        '文件大小': file_size,
+                        '修改时间': mod_time_formatted,
+                        '数据主体': pdf_data,
+                        '扩展名': os.path.splitext(name)[-1],
+                        '更新时间': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                    }
+                    set_typ = {
+                        '日期': 'date',
+                        '数据来源': 'varchar(100)',
+                        '文件名称': 'varchar(255)',
+                        '文件大小': 'varchar(20)',
+                        '修改时间': 'timestamp',
+                        '数据主体': 'longblob',
+                        '扩展名': 'varchar(50)',
+                        '更新时间': 'timestamp',
+                    }
+                    m_engine.doc_to_sql(
+                        db_name='pdf文件',
+                        table_name='微信pdf文件',
+                        remove_by_key=['文件名称'],
+                        dict_data=dict_data,
+                        set_typ=set_typ,
+                        allow_not_null=False,
+                        filename=name,
+                        reset_id=True,
+                    )
+    if read_data:
+        filename=''
+        save_path = '/Users/xigua/Downloads'
+        m_engine.read_doc_data(
+            db_name='pdf文件',
+            table_name='微信pdf文件',
+            column='文件名称',
+            filename=filename,
+            save_path=save_path,
+        )
+
 if __name__ == '__main__':
+    doc_to_sql(
+        write_data=True,
+        read_data=False,
+    )
     # cut_as_year_month(as_month=False)
 
     # username = 'root'
@@ -1332,12 +1416,12 @@ if __name__ == '__main__':
     # host = ''
     # port = ''
 
-    # 上传 1 个文件到数据库
-    one_file_to_mysql(
-
-
-
-    )
+    # # 上传 1 个文件到数据库
+    # one_file_to_mysql(
+    #     file=r'/Users/xigua/Downloads/日期表.csv',
+    #     db_name='聚合数据test',
+    #     table_name='日期表',
+    # )
 
 
     # col = 1
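The doc_to_sql function added above formats each file's byte count into a human-readable size string before uploading. A minimal standalone sketch of the same thresholds (the helper name is hypothetical, not part of mdbq):

def human_size(num_bytes):
    # Same cutoffs as the added code: GB above 1024**3 bytes, MB above 1024**2 bytes, otherwise KB.
    if num_bytes > 1024 ** 3:
        return f'{num_bytes / 1024 ** 3:.2f} GB'
    elif num_bytes > 1024 ** 2:
        return f'{num_bytes / 1024 ** 2:.2f} MB'
    return f'{num_bytes / 1024:.2f} KB'

print(human_size(3_500_000))  # -> '3.34 MB'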
mdbq/aggregation/query_data.py
CHANGED
@@ -1895,6 +1895,7 @@ class MysqlDatasQuery:
         start_date, end_date = self.months_data(num=self.months)
         projection = {
             '日期': 1,
+            '场景id': 1,
             '场景名字': 1,
             '花费': 1,
             '展现量': 1,
@@ -1918,10 +1919,10 @@ class MysqlDatasQuery:
         if len(df_tm) > 0:
             df_tm.rename(columns={'场景名字': '营销场景'}, inplace=True)
             df_tm = df_tm.groupby(
-                ['日期', '店铺名称', '营销场景', '花费'],
+                ['日期', '店铺名称', '场景id', '营销场景', '花费', '展现量'],
                 as_index=False).agg(
                 **{
-                    '展现量': ('展现量', np.max),
+                    # '展现量': ('展现量', np.max),
                     '点击量': ('点击量', np.max),
                     '加购量': ('总购物车数', np.max),
                     '成交笔数': ('总成交笔数', np.max),
@@ -1942,10 +1943,10 @@ class MysqlDatasQuery:
         if len(df_tb) > 0:
             df_tb.rename(columns={'场景名字': '营销场景'}, inplace=True)
             df_tb = df_tb.groupby(
-                ['日期', '店铺名称', '营销场景', '花费'],
+                ['日期', '店铺名称', '场景id', '营销场景', '花费', '展现量'],
                 as_index=False).agg(
                 **{
-                    '展现量': ('展现量', np.max),
+                    # '展现量': ('展现量', np.max),
                     '点击量': ('点击量', np.max),
                     '加购量': ('总购物车数', np.max),
                     '成交笔数': ('总成交笔数', np.max),
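These query_data.py hunks drop the np.max aggregation on 展现量 and instead carry 场景id and 展现量 as group keys. A minimal pandas sketch with made-up rows (not data from the package) showing the effect: rows that differ only in 展现量 are no longer collapsed into one group.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    '日期': ['2024-10-01', '2024-10-01'],
    '店铺名称': ['示例店铺', '示例店铺'],
    '场景id': [101, 101],
    '营销场景': ['关键词推广', '关键词推广'],
    '花费': [50.0, 50.0],
    '展现量': [1200, 1300],
    '点击量': [30, 35],
})

# 3.2.11 grouping: 展现量 is aggregated away, so the two rows collapse into one
old = df.groupby(['日期', '店铺名称', '营销场景', '花费'], as_index=False).agg(
    **{'展现量': ('展现量', np.max), '点击量': ('点击量', np.max)})

# 3.2.12 grouping: 场景id and 展现量 join the keys, so both rows survive
new = df.groupby(['日期', '店铺名称', '场景id', '营销场景', '花费', '展现量'], as_index=False).agg(
    **{'点击量': ('点击量', np.max)})

print(len(old), len(new))  # 1 2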
mdbq/mysql/mysql.py
CHANGED
@@ -4,6 +4,7 @@ import platform
 import getpass
 import re
 import time
+from fileinput import filename
 from functools import wraps
 import warnings
 import pymysql
@@ -117,13 +118,203 @@ class MysqlUpload:
                 print(f'{func.__name__}, {e}')  # 将异常信息返回
                 with open(error_file, 'a') as f:
                     now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-                    f.write(f'\n{now}\n')
+                    f.write(f'\n{now} \n')
                     # f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n')  # 发生异常所在的文件
                     traceback.print_exc(file=open(error_file, 'a'))  # 返回完整的堆栈信息
                     print(f'更多信息请查看日志文件: {error_file}')
 
         return wrapper
 
+    def cover_doc_dtypes(self, dict_data):
+        """ 清理字典键值 并转换数据类型 """
+        if not dict_data:
+            print(f'mysql.py -> MysqlUpload -> cover_dict_dtypes -> 传入的字典不能为空')
+            return
+        __res_dict = {}
+        new_dict_data = {}
+        for k, v in dict_data.items():
+            k = str(k).lower()
+            k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, re.IGNORECASE)
+            k = k.replace(')', '')
+            k = re.sub(r'_{2,}', '_', k)
+            k = re.sub(r'_+$', '', k)
+            result1 = re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE)
+            result2 = re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE)
+            result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
+            result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)
+
+            date_type = is_valid_date(v)  # 判断日期时间
+            int_num = is_integer(v)  # 判断整数
+            count_int, count_float = count_decimal_places(v)  # 判断小数,返回小数位数
+            if result1:  # 京东sku/spu商品信息
+                __res_dict.update({k: 'varchar(100)'})
+            elif k == '日期':
+                __res_dict.update({k: 'DATE'})
+            elif k == '更新时间':
+                __res_dict.update({k: 'TIMESTAMP'})
+            elif result2:  # 小数
+                __res_dict.update({k: 'decimal(10,4)'})
+            elif date_type == 1:  # 纯日期
+                __res_dict.update({k: 'DATE'})
+            elif date_type == 2:  # 日期+时间
+                __res_dict.update({k: 'DATETIME'})
+            elif int_num:
+                __res_dict.update({k: 'INT'})
+            elif count_float > 0:
+                if count_int + count_float > 10:
+                    # if count_float > 5:
+                    #     v = round(float(v), 4)
+                    if count_float >= 6:
+                        __res_dict.update({k: 'decimal(14,6)'})
+                    else:
+                        __res_dict.update({k: 'decimal(14,4)'})
+                elif count_float >= 6:
+                    __res_dict.update({k: 'decimal(14,6)'})
+                elif count_float >= 4:
+                    __res_dict.update({k: 'decimal(12,4)'})
+                else:
+                    __res_dict.update({k: 'decimal(10,2)'})
+            else:
+                __res_dict.update({k: 'varchar(255)'})
+            new_dict_data.update({k: v})
+        __res_dict.update({'数据主体': 'longblob'})
+        return __res_dict, new_dict_data
+
+    # @try_except
+    def doc_to_sql(self, db_name, table_name, dict_data, set_typ={}, remove_by_key=None, allow_not_null=False, filename=None, reset_id=False):
+        """
+        db_name:
+        table_name:
+        remove_by_key: 设置时先删除数据再插入,不设置则直接添加
+        dict_data:
+        set_typ:
+        allow_not_null:
+        filename:
+        reset_id:
+        """
+        if '数据主体' not in dict_data.keys():
+            print(f'dict_data 中"数据主体"键不能为空')
+            return
+
+        connection = pymysql.connect(**self.config)  # 连接数据库
+        with connection.cursor() as cursor:
+            cursor.execute(f"SHOW DATABASES LIKE '{db_name}'")  # 检查数据库是否存在
+            database_exists = cursor.fetchone()
+            if not database_exists:
+                # 如果数据库不存在,则新建
+                if '8.138.27' in str(self.host) or platform.system() == "Linux":  # 阿里云 mysql 低版本不支持 0900
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
+                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
+                if '192.168.1.100' in str(self.host):
+                    sql = f"CREATE DATABASE `{db_name}`"
+                else:
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
+                cursor.execute(sql)
+                connection.commit()
+                print(f"创建Database: {db_name}")
+
+        self.config.update({'database': db_name})  # 添加更新 config 字段
+        connection = pymysql.connect(**self.config)  # 重新连接数据库
+        with connection.cursor() as cursor:
+            # 1. 查询表, 不存在则创建一个空表
+            sql = "SHOW TABLES LIKE %s;"  # 有特殊字符不需转义
+            cursor.execute(sql, (table_name))
+            if not cursor.fetchone():
+                sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);"
+                cursor.execute(sql)
+                print(f'创建 mysql 表: {table_name}')
+
+            new_dict = {}
+            [new_dict.update({k: v}) for k, v in dict_data.items() if k != '数据主体']
+            # 清理列名中的非法字符
+            dtypes, new_dict = self.cover_doc_dtypes(new_dict)
+            if set_typ:
+                # 更新自定义的列数据类型
+                for k, v in dtypes.items():
+                    # 确保传进来的 set_typ 键存在于实际的 df 列才 update
+                    [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
+
+            # 检查列
+            sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
+            cursor.execute(sql, (db_name, table_name))
+            col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]  # 已存在的所有列
+
+            col_not_exist = [col for col in set_typ.keys() if col not in col_exist]  # 不存在的列
+            # 不存在则新建列
+            if col_not_exist:  # 数据表中不存在的列
+                for col in col_not_exist:
+                    # 创建列,需转义
+                    if allow_not_null:
+                        sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {set_typ[col]};"
+                    else:
+                        sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {set_typ[col]} NOT NULL;"
+                    cursor.execute(sql)
+                    print(f"添加列: {col}({set_typ[col]})")  # 添加列并指定数据类型
+
+                    if col == '日期':
+                        sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
+                        print(f"设置为索引: {col}({set_typ[col]})")
+                        cursor.execute(sql)
+            connection.commit()  # 提交事务
+
+            if remove_by_key:
+                # 删除数据
+                se_key = ', '.join(remove_by_key)
+                condition = []
+                for up_col in remove_by_key:
+                    condition += [f'`{up_col}` = "{dict_data[up_col]}"']
+                condition = ' AND '.join(condition)
+                # print(condition)
+                sql = f"SELECT {se_key} FROM `{table_name}` WHERE {condition}"
+                cursor.execute(sql)
+                result = cursor.fetchall()
+                if result:
+                    sql = f'DELETE FROM `{table_name}` WHERE {condition};'
+                    cursor.execute(sql)
+
+            # 插入数据到数据库
+            # 有数据格式错误问题,所以分开处理,将数据主体移到最后面用占位符
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            print(f'{now} 正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name} -> {filename}')
+            if new_dict:
+                cols = ', '.join(f"`{item}`" for item in new_dict.keys())  # 列名需要转义
+                values = ', '.join([f'"{item}"' for item in new_dict.values()])  # 值要加引号
+                cols = ', '.join([cols, '数据主体'])
+                binary_data = dict_data['数据主体']
+                sql = f"INSERT INTO `{table_name}` ({cols}) VALUES ({values}, %s)"
+                # print(sql)
+                cursor.execute(sql, binary_data)
+            else:
+                sql = f"""INSERT INTO `{table_name}` (数据主体) VALUES (%s);"""
+                cursor.execute(sql, dict_data['数据主体'])
+
+            if reset_id:
+                # 6. 重置自增列
+                try:
+                    # 查询所有复合主键
+                    sql = (
+                        f"SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
+                        f"WHERE `TABLE_SCHEMA` = '{db_name}'AND `TABLE_NAME` = '{table_name}' AND `COLUMN_KEY` = 'PRI';"
+                    )
+                    cursor.execute(sql)
+                    result = cursor.fetchall()  # 复合主键数
+                    if len(result) <= 1:  # 如果存在复合主键,则不能直接删除 id 键,其他主键可能不是唯一,会报错
+                        cursor.execute(f"SHOW COLUMNS FROM {table_name} LIKE 'id'")
+                        result = cursor.fetchone()
+                        if result:
+                            cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN id;")  # 删除 id 列
+                            cursor.execute(
+                                f"ALTER TABLE {table_name} ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
+                            cursor.execute(f"ALTER TABLE {table_name} AUTO_INCREMENT = 1")  # 设置自增从 1 开始
+                            # print(f'重置自增id')
+                    else:
+                        print(f'{table_name} 当前表存在复合主键: {result}, 无法重置自增id')
+                except Exception as e:
+                    print(f'{e}')
+                    connection.rollback()
+        connection.commit()
+
+
     @try_except
     def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, main_key=None, unique_main_key=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
         """
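The new MysqlUpload.doc_to_sql stores one document per row: every dict_data key except 数据主体 becomes an ordinary column (types inferred by cover_doc_dtypes unless overridden through set_typ), while the 数据主体 bytes are bound last as a longblob via a placeholder. A minimal sketch of a direct call, modelled on the call site added in aggregation.py; the credentials, file path and payload below are placeholders, not values from the package:

from mdbq.mysql import mysql

uploader = mysql.MysqlUpload(username='user', password='pass', host='127.0.0.1', port=3306, charset='utf8mb4')

with open('/path/to/some.pdf', 'rb') as f:  # hypothetical local file
    payload = f.read()

uploader.doc_to_sql(
    db_name='pdf文件',
    table_name='微信pdf文件',
    remove_by_key=['文件名称'],  # matching rows are deleted before the insert
    dict_data={'文件名称': 'some.pdf', '数据主体': payload},
    set_typ={'文件名称': 'varchar(255)', '数据主体': 'longblob'},
    allow_not_null=False,
    filename='some.pdf',
    reset_id=True,
)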
@@ -188,7 +379,7 @@ class MysqlUpload:
                 print(f'创建 mysql 表: {table_name}')
 
             # 根据 dict_data 的值添加指定的数据类型
-            dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data)  # {'店铺名称': '
+            dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data)  # {'店铺名称': 'varchar(100)',...}
             if set_typ:
                 # 更新自定义的列数据类型
                 for k, v in dtypes.items():
@@ -574,9 +765,9 @@ class MysqlUpload:
             connection.commit()  # 提交事务
 
         if df_sql:
-            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
             print(
-                f'{now}正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
+                f'{now} 正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
             engine = create_engine(
                 f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}")  # 创建数据库引擎
             # df.to_csv('/Users/xigua/Downloads/mysql.csv', index=False, header=True, encoding='utf-8_sig')
@@ -784,7 +975,51 @@ class MysqlUpload:
             connection.commit()  # 提交事务
         connection.close()
 
-
+    @try_except
+    def read_doc_data(self, table_name, db_name='pdf文件', column='文件名', filename=None, save_path='/Users/xigua/Downloads'):
+        """
+        db_name:
+        table_name:
+        column: 读取哪一列
+        filename: 文件名称
+        save_path: 保存位置
+        """
+        if not filename:
+            print(f'未指定文件名: filename')
+            return
+        connection = pymysql.connect(**self.config)  # 连接数据库
+        # try:
+        with connection.cursor() as cursor:
+            cursor.execute(f"SHOW DATABASES LIKE '{db_name}'")  # 检查数据库是否存在
+            database_exists = cursor.fetchone()
+            if not database_exists:
+                print(f"Database {db_name} 数据库不存在")
+                return
+        self.config.update({'database': db_name})
+        connection = pymysql.connect(**self.config)  # 重新连接数据库
+        with connection.cursor() as cursor:
+            # 1. 查询表
+            sql = "SHOW TABLES LIKE %s;"  # 有特殊字符不需转义
+            cursor.execute(sql, (table_name))
+            if not cursor.fetchone():
+                print(f'{table_name} -> 数据表不存在')
+                return
+
+            # 读取数据
+            condition = f'`{column}` = "{filename}"'
+            sql = f"SELECT `{column}`, `数据主体` FROM `{table_name}` WHERE {condition}"
+            cursor.execute(sql)
+            results = cursor.fetchall()
+            if results:
+                for result in results:
+                    # 将二进制数据写入到文件
+                    with open(os.path.join(save_path, filename), 'wb') as f:
+                        f.write(result['数据主体'])
+                    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    print(f'{now} 写入本地文件: ({self.host}:{self.port}) {db_name}/{table_name} -> {os.path.join(save_path, filename)}')
+        connection.close()
+
+
     def read_mysql(self, table_name, start_date, end_date, db_name='远程数据源', date_name='日期'):
         """ 读取指定数据表,可指定日期范围,返回结果: df """
         start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d')
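read_doc_data is the reverse path: it matches a row on the given column value and writes its 数据主体 blob back to disk under save_path. A short sketch reusing the uploader instance from the doc_to_sql example above; the destination folder is a placeholder:

uploader.read_doc_data(
    db_name='pdf文件',
    table_name='微信pdf文件',
    column='文件名称',    # column to match on
    filename='some.pdf',  # value to look up; also used as the output file name
    save_path='/tmp',     # hypothetical destination folder
)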
@@ -800,8 +1035,8 @@ class MysqlUpload:
                 print(f"Database {db_name} 数据库不存在")
                 return df
             else:
-                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                print(f'{now}mysql 正在查询表: {table_name}, 范围: {start_date}~{end_date}')
+                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                print(f'{now} mysql 正在查询表: {table_name}, 范围: {start_date}~{end_date}')
         except:
             return df
         finally:
@@ -828,11 +1063,11 @@ class MysqlUpload:
         if len(df) == 0:
             print(f'database: {db_name}, table: {table_name} 查询的数据为空')
         else:
-            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
             cost_time = int(time.time() - before_time)
             if cost_time < 1:
                 cost_time = round(time.time() - before_time, 2)
-            print(f'{now}mysql ({self.host}) 表: {table_name} 获取数据长度: {len(df)}, 用时: {cost_time} 秒')
+            print(f'{now} mysql ({self.host}) 表: {table_name} 获取数据长度: {len(df)}, 用时: {cost_time} 秒')
         return df
 
     def upload_pandas(self, update_path, db_name, days=None):
@@ -860,8 +1095,8 @@ class MysqlUpload:
                 if name.endswith('.csv') and 'baidu' not in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                     # if '日期' not in df.columns.tolist():
-                    #     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                    #     print(f'{now}{root_file} 缺少日期列, 不支持上传 mysql')
+                    #     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    #     print(f'{now} {root_file} 缺少日期列, 不支持上传 mysql')
                     #     continue
                     if '日期' in df.columns.tolist():
                         df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else x)
@@ -873,8 +1108,8 @@ class MysqlUpload:
             if f_path.endswith('.csv') and 'baidu' not in f_path:
                 df = pd.read_csv(f_path, encoding='utf-8_sig', header=0, na_filter=False)
                 # if '日期' not in df.columns.tolist():
-                #     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                #     print(f'{now}{root_file} 缺少日期列, 不支持上传 mysql')
+                #     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                #     print(f'{now} {root_file} 缺少日期列, 不支持上传 mysql')
                 #     continue
                 if '日期' not in df.columns.tolist():
                     df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else x)
@@ -925,7 +1160,7 @@ class OptimizeDatas:
                 print(f'{func.__name__}, {e}')  # 将异常信息返回
                 with open(error_file, 'a') as f:
                     now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-                    f.write(f'\n{now}\n')
+                    f.write(f'\n{now} \n')
                     # f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n')  # 发生异常所在的文件
                     traceback.print_exc(file=open(error_file, 'a'))  # 返回完整的堆栈信息
                     print(f'更多信息请查看日志文件: {error_file}')
@@ -938,8 +1173,8 @@ class OptimizeDatas:
         需要设置 self.db_name_lists
         """
         if not self.db_name_lists:
-            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-            print(f'{now}尚未设置参数: self.db_name_lists')
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            print(f'{now} 尚未设置参数: self.db_name_lists')
             return
         for db_name in self.db_name_lists:
             self.db_name = db_name
@@ -948,13 +1183,13 @@ class OptimizeDatas:
     def optimize(self, except_key=['更新时间']):
         """ 更新一个数据库 移除冗余数据 """
         if not self.db_name:
-            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-            print(f'{now}尚未设置参数: self.db_name')
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            print(f'{now} 尚未设置参数: self.db_name')
             return
         tables = self.table_list(db_name=self.db_name)
         if not tables:
-            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-            print(f'{now}{self.db_name} -> 数据表不存在')
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            print(f'{now} {self.db_name} -> 数据表不存在')
             return
 
         # 日期初始化
@@ -971,8 +1206,8 @@ class OptimizeDatas:
         start_date_before = self.start_date
         end_date_before = self.end_date
 
-        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-        print(f'{now}mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
+        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        print(f'{now} mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
         for table_dict in tables:
             for key, table_name in table_dict.items():
                 # if '店铺指标' not in table_name:
@@ -985,8 +1220,8 @@ class OptimizeDatas:
                 cursor.execute(sql)
                 result = cursor.fetchone()
                 if not result:
-                    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                    print(f'{now}数据表: {table_name}, 数据长度为 0')
+                    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    print(f'{now} 数据表: {table_name}, 数据长度为 0')
                     continue  # 检查数据表是否为空
 
                 cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`")  # 查询数据表的列信息
@@ -1042,8 +1277,8 @@ class OptimizeDatas:
             print(f'{e}')
             self.connection.rollback()
         self.connection.close()
-        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-        print(f'{now}mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
+        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        print(f'{now} mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
 
     def delete_duplicate(self, table_name, date, except_key=['更新时间']):
         datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
@@ -1076,8 +1311,8 @@ class OptimizeDatas:
                 # 移除冗余数据
                 sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
                 cursor.execute(sql, duplicate_id)
-                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                print(f"{now}{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
+                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                print(f"{now} {table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
             self.connection.commit()  # 提交事务
         except Exception as e:
             print(f'{self.db_name}/{table_name}, {e}')
@@ -1114,8 +1349,8 @@ class OptimizeDatas:
                 # 移除冗余数据
                 sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
                 cursor.execute(sql, duplicate_id)
-                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                print(f"{now}{table_name} -> before: {len(datas)}, "
+                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                print(f"{now} {table_name} -> before: {len(datas)}, "
                       f"remove: {cursor.rowcount}")
             self.connection.commit()  # 提交事务
         except Exception as e:
@@ -1139,8 +1374,8 @@ class OptimizeDatas:
             cursor.execute(f"SHOW DATABASES LIKE '{db_name}'")  # 检查数据库是否存在
             database_exists = cursor.fetchone()
             if not database_exists:
-                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
-                print(f'{now}{db_name}: 数据表不存在!')
+                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                print(f'{now} {db_name}: 数据表不存在!')
                 return
         except Exception as e:
             print(f'002 {e}')
{mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
 mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/aggregation.py,sha256=
+mdbq/aggregation/aggregation.py,sha256=cVp7MLFOSOAtfuCqjZYW7S3mEdw2Gc_jEdqCeWz7qh0,80264
 mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
 mdbq/aggregation/mysql_types.py,sha256=YTGyrF9vcRgfkQbpT-e-JdJ7c7VF1dDHgyx9YZRES8w,10934
 mdbq/aggregation/optimize_data.py,sha256=RXIv7cACCgYyehAxMjUYi_S7rVyjIwXKWMaM3nduGtA,3068
-mdbq/aggregation/query_data.py,sha256=
+mdbq/aggregation/query_data.py,sha256=2--y1VNYhL7lCeVA9WjIHiz3K_2JYm9agFqWd5jaeIc,148341
 mdbq/aggregation/query_data_bak.py,sha256=r1FU0C4zjXln7oVSrRkElh4Ehl-9mYhGcq57jLbViUA,104071
 mdbq/aggregation/query_data_bak20241124.py,sha256=oY95ZK3qt3Wx9pdZKZ5cvDh45Yi5yGj1kl8G6riumHA,144513
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
@@ -28,7 +28,7 @@ mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
 mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/mysql.py,sha256
+mdbq/mysql/mysql.py,sha256=ZG6BMfoXg6YGnHqv7GfwPwd7RXjoetCAFqPnbdHWqOM,79507
 mdbq/mysql/recheck_mysql.py,sha256=rgTpvDMWYTyEn7UQdlig-pdXDluTgiU8JG6lkMh8DV0,8665
 mdbq/mysql/s_query.py,sha256=MbIprZ4yJDAZ9AahZPzl7hqS695Vs0P-AJNwAtA_EEc,9287
 mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
@@ -46,7 +46,7 @@ mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/req_post/req_tb.py,sha256=qg7pet73IgKGmCwxaeUyImJIoeK_pBQT9BBKD7fkBNg,36160
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=nIKKZOZbemKqcrikcrMmtksLgJjjzeU0I99teBgU1jE,22439
-mdbq-3.2.
-mdbq-3.2.
-mdbq-3.2.
-mdbq-3.2.
+mdbq-3.2.12.dist-info/METADATA,sha256=W62uxvamVOW_S6O91kqwl5N36Nh8QzvKHF-C5ZyiD-w,244
+mdbq-3.2.12.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+mdbq-3.2.12.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.2.12.dist-info/RECORD,,
{mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/WHEEL
File without changes
{mdbq-3.2.11.dist-info → mdbq-3.2.12.dist-info}/top_level.txt
File without changes