mdbq 3.2.10__py3-none-any.whl → 3.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +113 -18
- mdbq/aggregation/query_data.py +31 -32
- mdbq/mysql/mysql.py +365 -80
- {mdbq-3.2.10.dist-info → mdbq-3.2.12.dist-info}/METADATA +1 -1
- {mdbq-3.2.10.dist-info → mdbq-3.2.12.dist-info}/RECORD +7 -7
- {mdbq-3.2.10.dist-info → mdbq-3.2.12.dist-info}/WHEEL +0 -0
- {mdbq-3.2.10.dist-info → mdbq-3.2.12.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -52,6 +52,16 @@ if not username:
|
|
52
52
|
print(f'找不到主机:')
|
53
53
|
|
54
54
|
|
55
|
+
def get_encoding(path, sample_size=None):
    """
    Detect the text encoding of a file with chardet.

    Detection over a whole large file is slow, so only call this when the
    encoding is genuinely unknown.

    path: file to inspect.
    sample_size: optional number of leading bytes to feed to the detector.
        ``None`` (the default) reads the entire file, which is the original
        behaviour; pass a positive int to bound time and memory.

    Returns the ``'encoding'`` entry of chardet's result, which may be
    ``None`` when chardet reports no encoding.
    """
    with open(path, 'rb') as f:
        raw = f.read() if sample_size is None else f.read(sample_size)
    return chardet.detect(raw).get('encoding')
|
63
|
+
|
64
|
+
|
55
65
|
class DatabaseUpdateBak:
|
56
66
|
"""
|
57
67
|
清洗文件,并入库,被 tg.py 调用
|
@@ -1236,20 +1246,22 @@ def one_file_to_mysql(file, db_name, table_name):
|
|
1236
1246
|
if file.endswith('.xlsx'):
|
1237
1247
|
df = pd.read_excel(file)
|
1238
1248
|
else:
|
1239
|
-
|
1249
|
+
encod = get_encoding(file)
|
1250
|
+
df = pd.read_csv(file, encoding=encod, header=0, na_filter=False, float_precision='high')
|
1240
1251
|
# df.replace(to_replace=[','], value='', regex=True, inplace=True) # 替换掉特殊字符
|
1241
1252
|
m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)
|
1253
|
+
# df.pop('id')
|
1242
1254
|
m.df_to_mysql(
|
1243
1255
|
df=df,
|
1244
1256
|
db_name=db_name,
|
1245
1257
|
table_name=table_name,
|
1246
1258
|
# icm_update=['sku_id'], # 增量更新, 在聚合数据中使用,其他不要用
|
1247
|
-
move_insert=
|
1259
|
+
move_insert=True, # 先删除,再插入
|
1248
1260
|
df_sql=False, # 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重
|
1249
1261
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1250
1262
|
count=None,
|
1251
1263
|
filename=None, # 用来追踪处理进度
|
1252
|
-
|
1264
|
+
reset_id=True, # 是否重置自增列
|
1253
1265
|
# set_typ=set_typ,
|
1254
1266
|
)
|
1255
1267
|
|
@@ -1312,32 +1324,115 @@ def cut_as_year_month(as_month=False):
|
|
1312
1324
|
df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)
|
1313
1325
|
|
1314
1326
|
|
1327
|
+
def doc_to_sql(write_data=False, read_data=False):
    """
    Upload documents from a fixed local folder into MySQL, and/or read one back.

    write_data: when True, walk the source folder recursively and store every
        ``.pdf`` / ``.pptx`` file (binary content plus metadata) through
        ``MysqlUpload.doc_to_sql``; an existing row with the same file name is
        replaced (``remove_by_key=['文件名称']``).
    read_data: when True, ask ``MysqlUpload.read_doc_data`` to write a stored
        document into the download folder. The file name is an empty
        placeholder here, so it has to be filled in before this does anything.

    Returns None. Does nothing when both flags are False or the source folder
    does not exist.
    """
    if not write_data and not read_data:
        return
    path = '/Users/xigua/数据中心/微信pdf文件/2024-10'

    if not os.path.isdir(path):
        print(f'不存在的文件夹: {path}')
        return
    m_engine = mysql.MysqlUpload(
        username=username,
        password=password,
        host=host,
        port=port,
        charset='utf8mb4'
    )
    if write_data:
        # Column types are identical for every file, so build the mapping once.
        set_typ = {
            '日期': 'date',
            '数据来源': 'varchar(100)',
            '文件名称': 'varchar(255)',
            '文件大小': 'varchar(20)',
            '修改时间': 'timestamp',
            '数据主体': 'longblob',
            '扩展名': 'varchar(50)',
            '更新时间': 'timestamp',
        }
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                # Skip Office lock files, macOS metadata and cloud-sync leftovers.
                if '~$' in name or '.DS' in name or '.localized' in name or 'baidu' in name:
                    continue
                if not name.endswith(('.pdf', '.pptx')):
                    continue

                # Always resolve against `root`: os.walk descends into
                # sub-folders, so joining with the top-level `path` would point
                # at a file that does not exist there.
                full_path = os.path.join(root, name)
                file_stat = os.stat(full_path)

                # Human-readable size with two decimals.
                size_in_bytes = file_stat.st_size
                if size_in_bytes > 1024 * 1024 * 1024:
                    file_size = f'{size_in_bytes / 1024 / 1024 / 1024:.2f} GB'
                elif size_in_bytes > 1024 * 1024:
                    file_size = f'{size_in_bytes / 1024 / 1024:.2f} MB'
                else:
                    file_size = f'{size_in_bytes / 1024:.2f} KB'

                mod_time_formatted = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(file_stat.st_mtime))

                # Read the document as raw bytes for the longblob column.
                with open(full_path, 'rb') as file:
                    pdf_data = file.read()

                dict_data = {
                    '日期': datetime.datetime.today().strftime('%Y-%m-%d'),
                    '数据来源': '微信',
                    '文件名称': name,
                    '文件大小': file_size,
                    '修改时间': mod_time_formatted,
                    '数据主体': pdf_data,
                    '扩展名': os.path.splitext(name)[-1],
                    '更新时间': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                }
                m_engine.doc_to_sql(
                    db_name='pdf文件',
                    table_name='微信pdf文件',
                    remove_by_key=['文件名称'],
                    dict_data=dict_data,
                    set_typ=set_typ,
                    allow_not_null=False,
                    filename=name,
                    reset_id=True,
                )
    if read_data:
        filename = ''
        save_path = '/Users/xigua/Downloads'
        m_engine.read_doc_data(
            db_name='pdf文件',
            table_name='微信pdf文件',
            column='文件名称',
            filename=filename,
            save_path=save_path,
        )
|
1406
|
+
|
1315
1407
|
if __name__ == '__main__':
|
1408
|
+
doc_to_sql(
|
1409
|
+
write_data=True,
|
1410
|
+
read_data=False,
|
1411
|
+
)
|
1316
1412
|
# cut_as_year_month(as_month=False)
|
1317
1413
|
|
1318
1414
|
# username = 'root'
|
1319
1415
|
# password = ''
|
1320
1416
|
# host = ''
|
1321
1417
|
# port = ''
|
1322
|
-
|
1418
|
+
|
1323
1419
|
# # 上传 1 个文件到数据库
|
1324
1420
|
# one_file_to_mysql(
|
1325
|
-
# file=r'/Users/xigua/Downloads
|
1326
|
-
# db_name='
|
1327
|
-
# table_name='
|
1421
|
+
# file=r'/Users/xigua/Downloads/日期表.csv',
|
1422
|
+
# db_name='聚合数据test',
|
1423
|
+
# table_name='日期表',
|
1328
1424
|
# )
|
1329
1425
|
|
1330
1426
|
|
1331
|
-
col = 1
|
1332
|
-
if col:
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1427
|
+
# col = 1
|
1428
|
+
# if col:
|
1429
|
+
# # 上传一个目录到指定数据库
|
1430
|
+
# db_name = '爱库存2'
|
1431
|
+
# table_name = '商品spu榜单'
|
1432
|
+
# upload_dir(
|
1433
|
+
# path=r'/Users/xigua/Downloads/数据上传中心',
|
1434
|
+
# db_name=db_name,
|
1435
|
+
# collection_name=table_name,
|
1436
|
+
# )
|
1342
1437
|
|
1343
1438
|
|
mdbq/aggregation/query_data.py
CHANGED
@@ -59,6 +59,7 @@ class MysqlDatasQuery:
|
|
59
59
|
self.update_service = True # 调试时加,true: 将数据写入 mysql 服务器
|
60
60
|
self.pf_datas = []
|
61
61
|
self.pf_datas_jd = [] # 京东聚合销售表
|
62
|
+
self.output = set_support.SetSupport(dirname='support')
|
62
63
|
|
63
64
|
@staticmethod
|
64
65
|
def try_except(func): # 在类内部定义一个异常处理方法
|
@@ -196,9 +197,8 @@ class MysqlDatasQuery:
|
|
196
197
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
197
198
|
count=None,
|
198
199
|
filename=None, # 用来追踪处理进度
|
199
|
-
reset_id=
|
200
|
+
reset_id=True, # 是否重置自增列
|
200
201
|
set_typ=set_typ,
|
201
|
-
|
202
202
|
)
|
203
203
|
|
204
204
|
# df_pic:商品排序索引表, 给 powerbi 中的主推款排序用的,(从上月1号到今天的总花费进行排序)
|
@@ -337,7 +337,7 @@ class MysqlDatasQuery:
|
|
337
337
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
338
338
|
count=None,
|
339
339
|
filename=None, # 用来追踪处理进度
|
340
|
-
reset_id=
|
340
|
+
reset_id=True, # 是否重置自增列
|
341
341
|
set_typ=set_typ,
|
342
342
|
)
|
343
343
|
return True
|
@@ -477,7 +477,7 @@ class MysqlDatasQuery:
|
|
477
477
|
df['人群分类'].fillna('', inplace=True)
|
478
478
|
if '人群分类' in df.columns.tolist():
|
479
479
|
# 这行决定了,从文件中读取的分类信息优先级高于内部函数的分类规则
|
480
|
-
# 这个 lambda
|
480
|
+
# 这个 lambda 适配人群名字中带有特定标识的分类,强匹配,自定义命名
|
481
481
|
df['人群分类'] = df.apply(
|
482
482
|
lambda x: self.set_crowd(keyword=str(x['人群名字']), as_file=False) if x['人群分类'] == ''
|
483
483
|
else x['人群分类'], axis=1
|
@@ -527,7 +527,7 @@ class MysqlDatasQuery:
|
|
527
527
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
528
528
|
count=None,
|
529
529
|
filename=None, # 用来追踪处理进度
|
530
|
-
reset_id=
|
530
|
+
reset_id=True, # 是否重置自增列
|
531
531
|
set_typ=set_typ,
|
532
532
|
)
|
533
533
|
return True
|
@@ -663,7 +663,7 @@ class MysqlDatasQuery:
|
|
663
663
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
664
664
|
count=None,
|
665
665
|
filename=None, # 用来追踪处理进度
|
666
|
-
reset_id=
|
666
|
+
reset_id=True, # 是否重置自增列
|
667
667
|
set_typ=set_typ,
|
668
668
|
)
|
669
669
|
return True
|
@@ -787,7 +787,7 @@ class MysqlDatasQuery:
|
|
787
787
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
788
788
|
count=None,
|
789
789
|
filename=None, # 用来追踪处理进度
|
790
|
-
reset_id=
|
790
|
+
reset_id=True, # 是否重置自增列
|
791
791
|
set_typ=set_typ,
|
792
792
|
)
|
793
793
|
return True
|
@@ -900,7 +900,7 @@ class MysqlDatasQuery:
|
|
900
900
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
901
901
|
count=None,
|
902
902
|
filename=None, # 用来追踪处理进度
|
903
|
-
reset_id=
|
903
|
+
reset_id=True, # 是否重置自增列
|
904
904
|
set_typ=set_typ,
|
905
905
|
)
|
906
906
|
return True
|
@@ -957,7 +957,7 @@ class MysqlDatasQuery:
|
|
957
957
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
958
958
|
count=None,
|
959
959
|
filename=None, # 用来追踪处理进度
|
960
|
-
reset_id=
|
960
|
+
reset_id=True, # 是否重置自增列
|
961
961
|
set_typ=set_typ,
|
962
962
|
)
|
963
963
|
return True
|
@@ -1016,7 +1016,6 @@ class MysqlDatasQuery:
|
|
1016
1016
|
filename=None, # 用来追踪处理进度
|
1017
1017
|
reset_id=False, # 是否重置自增列
|
1018
1018
|
set_typ=set_typ,
|
1019
|
-
|
1020
1019
|
)
|
1021
1020
|
return True
|
1022
1021
|
|
@@ -1123,7 +1122,7 @@ class MysqlDatasQuery:
|
|
1123
1122
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1124
1123
|
count=None,
|
1125
1124
|
filename=None, # 用来追踪处理进度
|
1126
|
-
reset_id=
|
1125
|
+
reset_id=True, # 是否重置自增列
|
1127
1126
|
set_typ=set_typ,
|
1128
1127
|
)
|
1129
1128
|
return True
|
@@ -1172,7 +1171,6 @@ class MysqlDatasQuery:
|
|
1172
1171
|
filename=None, # 用来追踪处理进度
|
1173
1172
|
reset_id=False, # 是否重置自增列
|
1174
1173
|
set_typ=set_typ,
|
1175
|
-
|
1176
1174
|
)
|
1177
1175
|
return True
|
1178
1176
|
|
@@ -1276,7 +1274,7 @@ class MysqlDatasQuery:
|
|
1276
1274
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1277
1275
|
count=None,
|
1278
1276
|
filename=None, # 用来追踪处理进度
|
1279
|
-
reset_id=
|
1277
|
+
reset_id=True, # 是否重置自增列
|
1280
1278
|
set_typ=set_typ,
|
1281
1279
|
|
1282
1280
|
)
|
@@ -1311,7 +1309,7 @@ class MysqlDatasQuery:
|
|
1311
1309
|
# drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1312
1310
|
# count=None,
|
1313
1311
|
# filename=None, # 用来追踪处理进度
|
1314
|
-
# reset_id=
|
1312
|
+
# reset_id=True, # 是否重置自增列
|
1315
1313
|
# set_typ=set_typ,
|
1316
1314
|
#
|
1317
1315
|
# )
|
@@ -1380,7 +1378,7 @@ class MysqlDatasQuery:
|
|
1380
1378
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1381
1379
|
count=None,
|
1382
1380
|
filename=None, # 用来追踪处理进度
|
1383
|
-
reset_id=
|
1381
|
+
reset_id=True, # 是否重置自增列
|
1384
1382
|
set_typ=set_typ,
|
1385
1383
|
|
1386
1384
|
)
|
@@ -1482,7 +1480,7 @@ class MysqlDatasQuery:
|
|
1482
1480
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1483
1481
|
count=None,
|
1484
1482
|
filename=None, # 用来追踪处理进度
|
1485
|
-
reset_id=
|
1483
|
+
reset_id=True, # 是否重置自增列
|
1486
1484
|
set_typ=set_typ,
|
1487
1485
|
|
1488
1486
|
)
|
@@ -1557,7 +1555,7 @@ class MysqlDatasQuery:
|
|
1557
1555
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1558
1556
|
count=None,
|
1559
1557
|
filename=None, # 用来追踪处理进度
|
1560
|
-
reset_id=
|
1558
|
+
reset_id=True, # 是否重置自增列
|
1561
1559
|
set_typ=set_typ,
|
1562
1560
|
)
|
1563
1561
|
return True
|
@@ -1623,7 +1621,7 @@ class MysqlDatasQuery:
|
|
1623
1621
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1624
1622
|
count=None,
|
1625
1623
|
filename=None, # 用来追踪处理进度
|
1626
|
-
reset_id=
|
1624
|
+
reset_id=True, # 是否重置自增列
|
1627
1625
|
set_typ=set_typ,
|
1628
1626
|
|
1629
1627
|
)
|
@@ -1707,9 +1705,8 @@ class MysqlDatasQuery:
|
|
1707
1705
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
1708
1706
|
count=None,
|
1709
1707
|
filename=None, # 用来追踪处理进度
|
1710
|
-
reset_id=
|
1708
|
+
reset_id=True, # 是否重置自增列
|
1711
1709
|
set_typ=set_typ,
|
1712
|
-
|
1713
1710
|
)
|
1714
1711
|
return True
|
1715
1712
|
|
@@ -1898,6 +1895,7 @@ class MysqlDatasQuery:
|
|
1898
1895
|
start_date, end_date = self.months_data(num=self.months)
|
1899
1896
|
projection = {
|
1900
1897
|
'日期': 1,
|
1898
|
+
'场景id': 1,
|
1901
1899
|
'场景名字': 1,
|
1902
1900
|
'花费': 1,
|
1903
1901
|
'展现量': 1,
|
@@ -1921,10 +1919,10 @@ class MysqlDatasQuery:
|
|
1921
1919
|
if len(df_tm) > 0:
|
1922
1920
|
df_tm.rename(columns={'场景名字': '营销场景'}, inplace=True)
|
1923
1921
|
df_tm = df_tm.groupby(
|
1924
|
-
['日期', '店铺名称', '营销场景', '花费'],
|
1922
|
+
['日期', '店铺名称', '场景id', '营销场景', '花费', '展现量'],
|
1925
1923
|
as_index=False).agg(
|
1926
1924
|
**{
|
1927
|
-
'展现量': ('展现量', np.max),
|
1925
|
+
# '展现量': ('展现量', np.max),
|
1928
1926
|
'点击量': ('点击量', np.max),
|
1929
1927
|
'加购量': ('总购物车数', np.max),
|
1930
1928
|
'成交笔数': ('总成交笔数', np.max),
|
@@ -1945,10 +1943,10 @@ class MysqlDatasQuery:
|
|
1945
1943
|
if len(df_tb) > 0:
|
1946
1944
|
df_tb.rename(columns={'场景名字': '营销场景'}, inplace=True)
|
1947
1945
|
df_tb = df_tb.groupby(
|
1948
|
-
['日期', '店铺名称', '营销场景', '花费'],
|
1946
|
+
['日期', '店铺名称', '场景id', '营销场景', '花费', '展现量'],
|
1949
1947
|
as_index=False).agg(
|
1950
1948
|
**{
|
1951
|
-
'展现量': ('展现量', np.max),
|
1949
|
+
# '展现量': ('展现量', np.max),
|
1952
1950
|
'点击量': ('点击量', np.max),
|
1953
1951
|
'加购量': ('总购物车数', np.max),
|
1954
1952
|
'成交笔数': ('总成交笔数', np.max),
|
@@ -2207,7 +2205,7 @@ class MysqlDatasQuery:
|
|
2207
2205
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
2208
2206
|
count=None,
|
2209
2207
|
filename=None, # 用来追踪处理进度
|
2210
|
-
reset_id=
|
2208
|
+
reset_id=True, # 是否重置自增列
|
2211
2209
|
set_typ=set_typ,
|
2212
2210
|
|
2213
2211
|
)
|
@@ -2324,7 +2322,7 @@ class MysqlDatasQuery:
|
|
2324
2322
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
2325
2323
|
count=None,
|
2326
2324
|
filename=None, # 用来追踪处理进度
|
2327
|
-
reset_id=
|
2325
|
+
reset_id=True, # 是否重置自增列
|
2328
2326
|
set_typ=set_typ,
|
2329
2327
|
|
2330
2328
|
)
|
@@ -2377,7 +2375,7 @@ class MysqlDatasQuery:
|
|
2377
2375
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
2378
2376
|
count=None,
|
2379
2377
|
filename=None, # 用来追踪处理进度
|
2380
|
-
reset_id=
|
2378
|
+
reset_id=True, # 是否重置自增列
|
2381
2379
|
set_typ=set_typ,
|
2382
2380
|
|
2383
2381
|
)
|
@@ -2490,7 +2488,7 @@ class MysqlDatasQuery:
|
|
2490
2488
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
2491
2489
|
count=None,
|
2492
2490
|
filename=None, # 用来追踪处理进度
|
2493
|
-
reset_id=
|
2491
|
+
reset_id=True, # 是否重置自增列
|
2494
2492
|
set_typ=set_typ,
|
2495
2493
|
)
|
2496
2494
|
return True
|
@@ -2739,6 +2737,7 @@ class MysqlDatasQuery:
|
|
2739
2737
|
'机会',
|
2740
2738
|
'推荐',
|
2741
2739
|
'智能定向',
|
2740
|
+
'AI',
|
2742
2741
|
]
|
2743
2742
|
},
|
2744
2743
|
{
|
@@ -2900,7 +2899,7 @@ class MysqlDatasQuery:
|
|
2900
2899
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
2901
2900
|
count=None,
|
2902
2901
|
filename=None, # 用来追踪处理进度
|
2903
|
-
reset_id=
|
2902
|
+
reset_id=True, # 是否重置自增列
|
2904
2903
|
set_typ=set_typ,
|
2905
2904
|
)
|
2906
2905
|
return True
|
@@ -3010,7 +3009,7 @@ class MysqlDatasQuery:
|
|
3010
3009
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
3011
3010
|
count=None,
|
3012
3011
|
filename=None, # 用来追踪处理进度
|
3013
|
-
reset_id=
|
3012
|
+
reset_id=True, # 是否重置自增列
|
3014
3013
|
set_typ=set_typ,
|
3015
3014
|
)
|
3016
3015
|
return True
|
@@ -3081,7 +3080,7 @@ class MysqlDatasQuery:
|
|
3081
3080
|
drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
3082
3081
|
count=None,
|
3083
3082
|
filename=None, # 用来追踪处理进度
|
3084
|
-
reset_id=
|
3083
|
+
reset_id=True, # 是否重置自增列
|
3085
3084
|
set_typ=set_typ,
|
3086
3085
|
)
|
3087
3086
|
return True
|
@@ -3310,7 +3309,7 @@ if __name__ == '__main__':
|
|
3310
3309
|
# future_to_function = {
|
3311
3310
|
# executor.submit(
|
3312
3311
|
# func_query,
|
3313
|
-
# months=
|
3312
|
+
# months=1,
|
3314
3313
|
# less_dict=[],
|
3315
3314
|
# ),
|
3316
3315
|
# }
|
mdbq/mysql/mysql.py
CHANGED
@@ -4,6 +4,7 @@ import platform
|
|
4
4
|
import getpass
|
5
5
|
import re
|
6
6
|
import time
|
7
|
+
from fileinput import filename
|
7
8
|
from functools import wraps
|
8
9
|
import warnings
|
9
10
|
import pymysql
|
@@ -117,13 +118,203 @@ class MysqlUpload:
|
|
117
118
|
print(f'{func.__name__}, {e}') # 将异常信息返回
|
118
119
|
with open(error_file, 'a') as f:
|
119
120
|
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
120
|
-
f.write(f'\n{now}\n')
|
121
|
+
f.write(f'\n{now} \n')
|
121
122
|
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
122
123
|
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
123
124
|
print(f'更多信息请查看日志文件: {error_file}')
|
124
125
|
|
125
126
|
return wrapper
|
126
127
|
|
128
|
+
def cover_doc_dtypes(self, dict_data):
    """
    Sanitize dictionary keys into column names and pick a MySQL type for each.

    dict_data: mapping of column name -> value for one row (without the
        binary payload).

    Returns a tuple ``(types, cleaned)`` where ``types`` maps each sanitized
    column name to a MySQL column type (and always contains
    ``'数据主体': 'longblob'``), and ``cleaned`` maps the sanitized names to the
    original values. Returns None, after printing a message, when
    ``dict_data`` is empty.

    The type is decided by the key name first (identifier-like names, the
    date / update-time columns, ratio-like suffixes) and only then by
    inspecting the value with is_valid_date / is_integer /
    count_decimal_places. Those helpers are called only when the name alone
    does not decide the type.
    """
    if not dict_data:
        print(f'mysql.py -> MysqlUpload -> cover_doc_dtypes -> 传入的字典不能为空')
        return
    col_types = {}
    new_dict_data = {}
    for k, v in dict_data.items():
        k = str(k).lower()
        # flags must be passed by keyword: the 4th positional argument of
        # re.sub is `count`, which silently limited this to two replacements.
        k = re.sub(r'[()\-,,$&~^、 ()\"\'“”=·/。》《><!!`]', '_', k, flags=re.IGNORECASE)
        k = k.replace(')', '')
        k = re.sub(r'_{2,}', '_', k)  # collapse runs of underscores
        k = re.sub(r'_+$', '', k)  # drop trailing underscores

        if re.findall(r'编码|_?id|货号|款号|文件大小', k, re.IGNORECASE):
            # Identifier-like columns (codes, ids, SKU numbers) stay textual.
            sql_type = 'varchar(100)'
        elif k == '日期':
            sql_type = 'DATE'
        elif k == '更新时间':
            sql_type = 'TIMESTAMP'
        elif re.findall(r'占比$|投产$|产出$|roi$|率$', k, re.IGNORECASE):
            # Ratio-like columns are stored as fixed-point decimals.
            sql_type = 'decimal(10,4)'
        else:
            # The name gives no hint, so look at the value itself.
            date_type = is_valid_date(v)
            if date_type == 1:  # date only
                sql_type = 'DATE'
            elif date_type == 2:  # date + time
                sql_type = 'DATETIME'
            elif is_integer(v):
                sql_type = 'INT'
            else:
                count_int, count_float = count_decimal_places(v)
                if count_float > 0:
                    if count_int + count_float > 10:
                        sql_type = 'decimal(14,6)' if count_float >= 6 else 'decimal(14,4)'
                    elif count_float >= 6:
                        sql_type = 'decimal(14,6)'
                    elif count_float >= 4:
                        sql_type = 'decimal(12,4)'
                    else:
                        sql_type = 'decimal(10,2)'
                else:
                    sql_type = 'varchar(255)'
        col_types[k] = sql_type
        new_dict_data[k] = v
    # The binary payload column is always a longblob.
    col_types['数据主体'] = 'longblob'
    return col_types, new_dict_data
|
182
|
+
|
183
|
+
# @try_except
|
184
|
+
def doc_to_sql(self, db_name, table_name, dict_data, set_typ=None, remove_by_key=None, allow_not_null=False, filename=None, reset_id=False):
    """
    Store one document (binary payload plus metadata) as a row in MySQL.

    The database and the table are created when missing, and missing columns
    are added before the row is inserted.

    db_name: target database.
    table_name: target table.
    dict_data: one row as ``{column: value}``; it must contain the key
        '数据主体' holding the binary payload.
    set_typ: optional ``{column: mysql_type}`` overriding the inferred types.
        Columns listed here are created even when absent from ``dict_data``.
    remove_by_key: list of column names; when given, rows whose values in
        these columns equal the ones in ``dict_data`` are deleted before the
        insert. When omitted the row is simply appended.
    allow_not_null: when False (default) new columns are created NOT NULL.
    filename: only used in the progress message.
    reset_id: when True, rebuild the auto-increment ``id`` column afterwards.

    Returns None. Database errors from connecting or inserting propagate to
    the caller; errors while resetting the id are printed and swallowed.
    """
    if '数据主体' not in dict_data.keys():
        print(f'dict_data 中"数据主体"键不能为空')
        return
    set_typ = set_typ or {}

    connection = pymysql.connect(**self.config)  # connect without selecting a database
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES LIKE %s", (db_name,))  # does the database exist?
            database_exists = cursor.fetchone()
            if not database_exists:
                # Older MySQL servers (Aliyun host / Linux) lack the 0900 collation.
                legacy_server = '8.138.27' in str(self.host) or platform.system() == "Linux"
                if legacy_server:
                    # NOTE(review): this stores a collation name under 'charset';
                    # kept as in the original, confirm the driver accepts it.
                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
                if '192.168.1.100' in str(self.host):
                    sql = f"CREATE DATABASE `{db_name}`"
                elif legacy_server:
                    # Previously overwritten by the 0900 statement below.
                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
                else:
                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
                cursor.execute(sql)
                connection.commit()
                print(f"创建Database: {db_name}")
    finally:
        connection.close()

    self.config.update({'database': db_name})  # later connections select this database
    connection = pymysql.connect(**self.config)  # reconnect with the database selected
    try:
        with connection.cursor() as cursor:
            # 1. Create an empty table when it does not exist yet.
            cursor.execute("SHOW TABLES LIKE %s;", (table_name,))
            if not cursor.fetchone():
                cursor.execute(
                    f"CREATE TABLE IF NOT EXISTS `{table_name}` (id INT AUTO_INCREMENT PRIMARY KEY);")
                print(f'创建 mysql 表: {table_name}')

            # 2. Split metadata from the payload, then sanitize names and infer types.
            new_dict = {k: v for k, v in dict_data.items() if k != '数据主体'}
            if new_dict:
                dtypes, new_dict = self.cover_doc_dtypes(new_dict)
            else:
                # cover_doc_dtypes returns None for an empty dict; only the payload is stored.
                dtypes = {'数据主体': 'longblob'}
            # Caller-supplied types win, but only for columns that really exist in the row.
            for col, col_type in set_typ.items():
                if col in dtypes:
                    dtypes[col] = col_type

            # 3. Add the columns the table is missing.
            sql = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s;"
            cursor.execute(sql, (db_name, table_name))
            col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]  # columns already present

            # Every column of set_typ (as before) plus every column of the row
            # itself, so the INSERT below cannot hit a missing column.
            col_types = dict(set_typ)
            col_types.update(dtypes)
            null_clause = '' if allow_not_null else ' NOT NULL'
            for col in [c for c in col_types if c not in col_exist]:
                cursor.execute(
                    f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {col_types[col]}{null_clause};")
                print(f"添加列: {col}({col_types[col]})")

                if col == '日期':
                    sql = f"CREATE INDEX index_name ON `{table_name}`(`{col}`);"
                    print(f"设置为索引: {col}({col_types[col]})")
                    cursor.execute(sql)
            connection.commit()

            # 4. Delete the rows this one replaces; values go through placeholders.
            if remove_by_key:
                condition = ' AND '.join(f'`{up_col}` = %s' for up_col in remove_by_key)
                cursor.execute(
                    f'DELETE FROM `{table_name}` WHERE {condition};',
                    [str(dict_data[up_col]) for up_col in remove_by_key],
                )

            # 5. Insert the row. Every value is a bound parameter so quotes or
            # '%' in a file name cannot corrupt the statement; metadata values
            # are sent as text, the payload as raw bytes.
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f'{now} 正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name} -> {filename}')
            columns = list(new_dict.keys()) + ['数据主体']
            params = [str(item) for item in new_dict.values()] + [dict_data['数据主体']]
            cols = ', '.join(f"`{item}`" for item in columns)
            placeholders = ', '.join(['%s'] * len(columns))
            cursor.execute(f"INSERT INTO `{table_name}` ({cols}) VALUES ({placeholders})", params)

            if reset_id:
                # 6. Rebuild the auto-increment id column (best effort).
                try:
                    sql = (
                        "SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
                        "WHERE `TABLE_SCHEMA` = %s AND `TABLE_NAME` = %s AND `COLUMN_KEY` = 'PRI';"
                    )
                    cursor.execute(sql, (db_name, table_name))
                    result = cursor.fetchall()  # primary-key columns
                    if len(result) <= 1:
                        # With a composite primary key the id column cannot simply be
                        # dropped: the remaining key columns may not be unique.
                        cursor.execute(f"SHOW COLUMNS FROM `{table_name}` LIKE 'id'")
                        result = cursor.fetchone()
                        if result:
                            cursor.execute(f"ALTER TABLE `{table_name}` DROP COLUMN id;")
                        cursor.execute(
                            f"ALTER TABLE `{table_name}` ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
                        cursor.execute(f"ALTER TABLE `{table_name}` AUTO_INCREMENT = 1")  # restart at 1
                    else:
                        print(f'{table_name} 当前表存在复合主键: {result}, 无法重置自增id')
                except Exception as e:
                    print(f'{e}')
                    connection.rollback()
            connection.commit()
    finally:
        connection.close()
|
316
|
+
|
317
|
+
|
127
318
|
@try_except
|
128
319
|
def dict_to_mysql(self, db_name, table_name, dict_data, icm_update=None, main_key=None, unique_main_key=None, index_length=100, set_typ=None, allow_not_null=False, cut_data=None):
|
129
320
|
"""
|
@@ -188,7 +379,7 @@ class MysqlUpload:
|
|
188
379
|
print(f'创建 mysql 表: {table_name}')
|
189
380
|
|
190
381
|
# 根据 dict_data 的值添加指定的数据类型
|
191
|
-
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': '
|
382
|
+
dtypes, dict_data = self.cover_dict_dtypes(dict_data=dict_data) # {'店铺名称': 'varchar(100)',...}
|
192
383
|
if set_typ:
|
193
384
|
# 更新自定义的列数据类型
|
194
385
|
for k, v in dtypes.items():
|
@@ -398,6 +589,9 @@ class MysqlUpload:
|
|
398
589
|
df.replace(to_replace=['"'], value='', regex=True, inplace=True)
|
399
590
|
cols = df.columns.tolist()
|
400
591
|
for col in cols:
|
592
|
+
if col == 'id':
|
593
|
+
df.pop('id')
|
594
|
+
continue
|
401
595
|
df[col] = df[col].apply(lambda x: float(re.sub(r'%$', '', str(x))) / 100 if (
|
402
596
|
str(x) != '' and str(x).endswith('%')) and not re.findall('[\\u4e00-\\u9fa5]', str(x)) else '0.0' if str(x) == '0%' else x)
|
403
597
|
try:
|
@@ -571,9 +765,9 @@ class MysqlUpload:
|
|
571
765
|
connection.commit() # 提交事务
|
572
766
|
|
573
767
|
if df_sql:
|
574
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
768
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
575
769
|
print(
|
576
|
-
f'{now}正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
|
770
|
+
f'{now} 正在更新: mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
|
577
771
|
engine = create_engine(
|
578
772
|
f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
|
579
773
|
# df.to_csv('/Users/xigua/Downloads/mysql.csv', index=False, header=True, encoding='utf-8_sig')
|
@@ -587,18 +781,29 @@ class MysqlUpload:
|
|
587
781
|
)
|
588
782
|
if reset_id:
|
589
783
|
pass
|
590
|
-
#
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
784
|
+
# 6. 重置自增列
|
785
|
+
try:
|
786
|
+
# 查询所有复合主键
|
787
|
+
sql = (
|
788
|
+
f"SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
|
789
|
+
f"WHERE `TABLE_SCHEMA` = '{db_name}'AND `TABLE_NAME` = '{table_name}' AND `COLUMN_KEY` = 'PRI';"
|
790
|
+
)
|
791
|
+
cursor.execute(sql)
|
792
|
+
result = cursor.fetchall() # 复合主键数
|
793
|
+
if len(result) <= 1: # 如果存在复合主键,则不能直接删除 id 键,其他主键可能不是唯一,会报错
|
794
|
+
cursor.execute(f"SHOW COLUMNS FROM {table_name} LIKE 'id'")
|
795
|
+
result = cursor.fetchone()
|
796
|
+
if result:
|
797
|
+
cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN id;") # 删除 id 列
|
798
|
+
cursor.execute(
|
799
|
+
f"ALTER TABLE {table_name} ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
|
800
|
+
cursor.execute(f"ALTER TABLE {table_name} AUTO_INCREMENT = 1") # 设置自增从 1 开始
|
801
|
+
# print(f'重置自增id')
|
802
|
+
else:
|
803
|
+
print(f'{table_name} 当前表存在复合主键: {result}, 无法重置自增id')
|
804
|
+
except Exception as e:
|
805
|
+
print(f'{e}')
|
806
|
+
connection.rollback()
|
602
807
|
connection.commit() # 提交事务
|
603
808
|
connection.close()
|
604
809
|
return
|
@@ -607,8 +812,11 @@ class MysqlUpload:
|
|
607
812
|
if move_insert and '日期' in df.columns.tolist():
|
608
813
|
# 移除数据
|
609
814
|
dates = df['日期'].values.tolist()
|
815
|
+
# print(dates)
|
816
|
+
dates = [pd.to_datetime(item) for item in dates] # 需要先转换类型才能用 min, max
|
610
817
|
start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
|
611
818
|
end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
|
819
|
+
|
612
820
|
sql = f"DELETE FROM `{table_name}` WHERE {'日期'} BETWEEN '%s' AND '%s'" % (start_date, end_date)
|
613
821
|
cursor.execute(sql)
|
614
822
|
connection.commit()
|
@@ -625,18 +833,28 @@ class MysqlUpload:
|
|
625
833
|
)
|
626
834
|
# 6. 重置自增列
|
627
835
|
if reset_id:
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
836
|
+
try:
|
837
|
+
# 查询所有复合主键
|
838
|
+
sql = (
|
839
|
+
f"SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
|
840
|
+
f"WHERE `TABLE_SCHEMA` = '{db_name}'AND `TABLE_NAME` = '{table_name}' AND `COLUMN_KEY` = 'PRI';"
|
841
|
+
)
|
842
|
+
cursor.execute(sql)
|
843
|
+
result = cursor.fetchall() # 复合主键数
|
844
|
+
if len(result) <= 1: # 如果存在复合主键,则不能直接删除 id 键,其他主键可能不是唯一,会报错
|
845
|
+
cursor.execute(f"SHOW COLUMNS FROM {table_name} LIKE 'id'")
|
846
|
+
result = cursor.fetchone()
|
847
|
+
if result:
|
848
|
+
cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN id;") # 删除 id 列
|
849
|
+
cursor.execute(
|
850
|
+
f"ALTER TABLE {table_name} ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
|
851
|
+
cursor.execute(f"ALTER TABLE {table_name} AUTO_INCREMENT = 1") # 设置自增从 1 开始
|
852
|
+
# print(f'重置自增id')
|
853
|
+
else:
|
854
|
+
print(f'{table_name} 当前表存在复合主键: {result}, 无法重置自增id')
|
855
|
+
except Exception as e:
|
856
|
+
print(f'{e}')
|
857
|
+
connection.rollback()
|
640
858
|
connection.close()
|
641
859
|
return
|
642
860
|
|
@@ -730,22 +948,78 @@ class MysqlUpload:
|
|
730
948
|
# print(f'mysql -> df_to_mysql 报错: {e}, {self.filename}')
|
731
949
|
# breakpoint()
|
732
950
|
|
733
|
-
#
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
951
|
+
# 6. 重置自增列
|
952
|
+
if reset_id:
|
953
|
+
try:
|
954
|
+
# 查询所有复合主键
|
955
|
+
sql = (
|
956
|
+
f"SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
|
957
|
+
f"WHERE `TABLE_SCHEMA` = '{db_name}'AND `TABLE_NAME` = '{table_name}' AND `COLUMN_KEY` = 'PRI';"
|
958
|
+
)
|
959
|
+
cursor.execute(sql)
|
960
|
+
result = cursor.fetchall() # 复合主键数
|
961
|
+
if len(result) <= 1: # 如果存在复合主键,则不能直接删除 id 键,其他主键可能不是唯一,会报错
|
962
|
+
cursor.execute(f"SHOW COLUMNS FROM {table_name} LIKE 'id'")
|
963
|
+
result = cursor.fetchone()
|
964
|
+
if result:
|
965
|
+
cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN id;") # 删除 id 列
|
966
|
+
cursor.execute(
|
967
|
+
f"ALTER TABLE {table_name} ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
|
968
|
+
cursor.execute(f"ALTER TABLE {table_name} AUTO_INCREMENT = 1") # 设置自增从 1 开始
|
969
|
+
# print(f'重置自增id')
|
970
|
+
else:
|
971
|
+
print(f'{table_name} 当前表存在复合主键: {result}, 无法重置自增id')
|
972
|
+
except Exception as e:
|
973
|
+
print(f'{e}')
|
974
|
+
connection.rollback()
|
745
975
|
connection.commit() # 提交事务
|
746
976
|
connection.close()
|
747
977
|
|
748
|
-
|
978
|
+
@try_except
def read_doc_data(self, table_name, db_name='pdf文件', column='文件名', filename=None, save_path='/Users/xigua/Downloads'):
    """
    Read a stored document back from MySQL and write it to a local file.

    table_name: table holding the documents.
    db_name: database holding the table.
    column: column that stores the file name and is matched against
        ``filename``.
    filename: name of the document to fetch; also the name of the file
        written under ``save_path``.
    save_path: directory the file is written to.

    Returns None. Prints a message and returns early when ``filename`` is
    empty or the database / table does not exist. If several rows match,
    each one is written to the same path, so the last one wins.
    """
    if not filename:
        print(f'未指定文件名: filename')
        return

    connection = pymysql.connect(**self.config)  # connect without selecting a database
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES LIKE %s", (db_name,))  # does the database exist?
            database_exists = cursor.fetchone()
    finally:
        connection.close()
    if not database_exists:
        print(f"Database {db_name} 数据库不存在")
        return

    self.config.update({'database': db_name})
    connection = pymysql.connect(**self.config)  # reconnect with the database selected
    try:
        with connection.cursor() as cursor:
            # 1. Make sure the table exists.
            cursor.execute("SHOW TABLES LIKE %s;", (table_name,))
            if not cursor.fetchone():
                print(f'{table_name} -> 数据表不存在')
                return

            # 2. Fetch the matching rows; the file name is a bound parameter so
            # quotes or '%' in it cannot alter the statement.
            sql = f"SELECT `{column}`, `数据主体` FROM `{table_name}` WHERE `{column}` = %s"
            cursor.execute(sql, (str(filename),))
            results = cursor.fetchall()

        # 3. Write the binary payload to disk.
        target = os.path.join(save_path, filename)
        for result in results:
            with open(target, 'wb') as f:
                f.write(result['数据主体'])
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f'{now} 写入本地文件: ({self.host}:{self.port}) {db_name}/{table_name} -> {target}')
    finally:
        # Runs on the early return and on errors too, so nothing leaks.
        connection.close()
|
1021
|
+
|
1022
|
+
|
749
1023
|
def read_mysql(self, table_name, start_date, end_date, db_name='远程数据源', date_name='日期'):
|
750
1024
|
""" 读取指定数据表,可指定日期范围,返回结果: df """
|
751
1025
|
start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d')
|
@@ -761,8 +1035,8 @@ class MysqlUpload:
|
|
761
1035
|
print(f"Database {db_name} 数据库不存在")
|
762
1036
|
return df
|
763
1037
|
else:
|
764
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
765
|
-
print(f'{now}mysql 正在查询表: {table_name}, 范围: {start_date}~{end_date}')
|
1038
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1039
|
+
print(f'{now} mysql 正在查询表: {table_name}, 范围: {start_date}~{end_date}')
|
766
1040
|
except:
|
767
1041
|
return df
|
768
1042
|
finally:
|
@@ -789,11 +1063,11 @@ class MysqlUpload:
|
|
789
1063
|
if len(df) == 0:
|
790
1064
|
print(f'database: {db_name}, table: {table_name} 查询的数据为空')
|
791
1065
|
else:
|
792
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
1066
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
793
1067
|
cost_time = int(time.time() - before_time)
|
794
1068
|
if cost_time < 1:
|
795
1069
|
cost_time = round(time.time() - before_time, 2)
|
796
|
-
print(f'{now}mysql ({self.host}) 表: {table_name} 获取数据长度: {len(df)}, 用时: {cost_time} 秒')
|
1070
|
+
print(f'{now} mysql ({self.host}) 表: {table_name} 获取数据长度: {len(df)}, 用时: {cost_time} 秒')
|
797
1071
|
return df
|
798
1072
|
|
799
1073
|
def upload_pandas(self, update_path, db_name, days=None):
|
@@ -821,8 +1095,8 @@ class MysqlUpload:
|
|
821
1095
|
if name.endswith('.csv') and 'baidu' not in name:
|
822
1096
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
823
1097
|
# if '日期' not in df.columns.tolist():
|
824
|
-
# now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
825
|
-
# print(f'{now}{root_file} 缺少日期列, 不支持上传 mysql')
|
1098
|
+
# now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1099
|
+
# print(f'{now} {root_file} 缺少日期列, 不支持上传 mysql')
|
826
1100
|
# continue
|
827
1101
|
if '日期' in df.columns.tolist():
|
828
1102
|
df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else x)
|
@@ -834,8 +1108,8 @@ class MysqlUpload:
|
|
834
1108
|
if f_path.endswith('.csv') and 'baidu' not in f_path:
|
835
1109
|
df = pd.read_csv(f_path, encoding='utf-8_sig', header=0, na_filter=False)
|
836
1110
|
# if '日期' not in df.columns.tolist():
|
837
|
-
# now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
838
|
-
# print(f'{now}{root_file} 缺少日期列, 不支持上传 mysql')
|
1111
|
+
# now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1112
|
+
# print(f'{now} {root_file} 缺少日期列, 不支持上传 mysql')
|
839
1113
|
# continue
|
840
1114
|
if '日期' not in df.columns.tolist():
|
841
1115
|
df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else x)
|
@@ -886,7 +1160,7 @@ class OptimizeDatas:
|
|
886
1160
|
print(f'{func.__name__}, {e}') # 将异常信息返回
|
887
1161
|
with open(error_file, 'a') as f:
|
888
1162
|
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
889
|
-
f.write(f'\n{now}\n')
|
1163
|
+
f.write(f'\n{now} \n')
|
890
1164
|
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
891
1165
|
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
892
1166
|
print(f'更多信息请查看日志文件: {error_file}')
|
@@ -899,8 +1173,8 @@ class OptimizeDatas:
|
|
899
1173
|
需要设置 self.db_name_lists
|
900
1174
|
"""
|
901
1175
|
if not self.db_name_lists:
|
902
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
903
|
-
print(f'{now}尚未设置参数: self.db_name_lists')
|
1176
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1177
|
+
print(f'{now} 尚未设置参数: self.db_name_lists')
|
904
1178
|
return
|
905
1179
|
for db_name in self.db_name_lists:
|
906
1180
|
self.db_name = db_name
|
@@ -909,13 +1183,13 @@ class OptimizeDatas:
|
|
909
1183
|
def optimize(self, except_key=['更新时间']):
|
910
1184
|
""" 更新一个数据库 移除冗余数据 """
|
911
1185
|
if not self.db_name:
|
912
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
913
|
-
print(f'{now}尚未设置参数: self.db_name')
|
1186
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1187
|
+
print(f'{now} 尚未设置参数: self.db_name')
|
914
1188
|
return
|
915
1189
|
tables = self.table_list(db_name=self.db_name)
|
916
1190
|
if not tables:
|
917
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
918
|
-
print(f'{now}{self.db_name} -> 数据表不存在')
|
1191
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1192
|
+
print(f'{now} {self.db_name} -> 数据表不存在')
|
919
1193
|
return
|
920
1194
|
|
921
1195
|
# 日期初始化
|
@@ -932,8 +1206,8 @@ class OptimizeDatas:
|
|
932
1206
|
start_date_before = self.start_date
|
933
1207
|
end_date_before = self.end_date
|
934
1208
|
|
935
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
936
|
-
print(f'{now}mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
|
1209
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1210
|
+
print(f'{now} mysql({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')
|
937
1211
|
for table_dict in tables:
|
938
1212
|
for key, table_name in table_dict.items():
|
939
1213
|
# if '店铺指标' not in table_name:
|
@@ -946,8 +1220,8 @@ class OptimizeDatas:
|
|
946
1220
|
cursor.execute(sql)
|
947
1221
|
result = cursor.fetchone()
|
948
1222
|
if not result:
|
949
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
950
|
-
print(f'{now}数据表: {table_name}, 数据长度为 0')
|
1223
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1224
|
+
print(f'{now} 数据表: {table_name}, 数据长度为 0')
|
951
1225
|
continue # 检查数据表是否为空
|
952
1226
|
|
953
1227
|
cursor.execute(f"SHOW FULL COLUMNS FROM `{table_name}`") # 查询数据表的列信息
|
@@ -979,21 +1253,32 @@ class OptimizeDatas:
|
|
979
1253
|
else: # 不存在日期列的情况
|
980
1254
|
self.delete_duplicate2(table_name=table_name, except_key=except_key)
|
981
1255
|
|
982
|
-
#
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
1256
|
+
# 6. 重置自增列
|
1257
|
+
try:
|
1258
|
+
# 查询所有复合主键
|
1259
|
+
sql = (
|
1260
|
+
f"SELECT `COLUMN_NAME` AS `PrimaryKey` FROM `information_schema`.`COLUMNS` "
|
1261
|
+
f"WHERE `TABLE_SCHEMA` = '{self.db_name}'AND `TABLE_NAME` = '{table_name}' AND `COLUMN_KEY` = 'PRI';"
|
1262
|
+
)
|
1263
|
+
cursor.execute(sql)
|
1264
|
+
result = cursor.fetchall() # 复合主键数
|
1265
|
+
if len(result) <= 1: # 如果存在复合主键,则不能直接删除 id 键,其他主键可能不是唯一,会报错
|
1266
|
+
cursor.execute(f"SHOW COLUMNS FROM {table_name} LIKE 'id'")
|
1267
|
+
result = cursor.fetchone()
|
1268
|
+
if result:
|
1269
|
+
cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN id;") # 删除 id 列
|
1270
|
+
cursor.execute(
|
1271
|
+
f"ALTER TABLE {table_name} ADD column id INT AUTO_INCREMENT PRIMARY KEY FIRST;")
|
1272
|
+
cursor.execute(f"ALTER TABLE {table_name} AUTO_INCREMENT = 1") # 设置自增从 1 开始
|
1273
|
+
# print(f'重置自增id')
|
1274
|
+
else:
|
1275
|
+
print(f'{table_name} 当前表存在复合主键: {result}, 无法重置自增id')
|
1276
|
+
except Exception as e:
|
1277
|
+
print(f'{e}')
|
1278
|
+
self.connection.rollback()
|
994
1279
|
self.connection.close()
|
995
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
996
|
-
print(f'{now}mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
|
1280
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1281
|
+
print(f'{now} mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
|
997
1282
|
|
998
1283
|
def delete_duplicate(self, table_name, date, except_key=['更新时间']):
|
999
1284
|
datas = self.table_datas(db_name=self.db_name, table_name=str(table_name), date=date)
|
@@ -1026,8 +1311,8 @@ class OptimizeDatas:
|
|
1026
1311
|
# 移除冗余数据
|
1027
1312
|
sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
|
1028
1313
|
cursor.execute(sql, duplicate_id)
|
1029
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
1030
|
-
print(f"{now}{table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
|
1314
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1315
|
+
print(f"{now} {table_name} -> {date.strftime('%Y-%m-%d')} before: {len(datas)}, remove: {cursor.rowcount}")
|
1031
1316
|
self.connection.commit() # 提交事务
|
1032
1317
|
except Exception as e:
|
1033
1318
|
print(f'{self.db_name}/{table_name}, {e}')
|
@@ -1064,8 +1349,8 @@ class OptimizeDatas:
|
|
1064
1349
|
# 移除冗余数据
|
1065
1350
|
sql = f"DELETE FROM `{table_name}` WHERE id IN ({placeholders})"
|
1066
1351
|
cursor.execute(sql, duplicate_id)
|
1067
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
1068
|
-
print(f"{now}{table_name} -> before: {len(datas)}, "
|
1352
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1353
|
+
print(f"{now} {table_name} -> before: {len(datas)}, "
|
1069
1354
|
f"remove: {cursor.rowcount}")
|
1070
1355
|
self.connection.commit() # 提交事务
|
1071
1356
|
except Exception as e:
|
@@ -1089,8 +1374,8 @@ class OptimizeDatas:
|
|
1089
1374
|
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
1090
1375
|
database_exists = cursor.fetchone()
|
1091
1376
|
if not database_exists:
|
1092
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S
|
1093
|
-
print(f'{now}{db_name}: 数据表不存在!')
|
1377
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1378
|
+
print(f'{now} {db_name}: 数据表不存在!')
|
1094
1379
|
return
|
1095
1380
|
except Exception as e:
|
1096
1381
|
print(f'002 {e}')
|
@@ -1,11 +1,11 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/aggregation.py,sha256=
|
4
|
+
mdbq/aggregation/aggregation.py,sha256=cVp7MLFOSOAtfuCqjZYW7S3mEdw2Gc_jEdqCeWz7qh0,80264
|
5
5
|
mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
|
6
6
|
mdbq/aggregation/mysql_types.py,sha256=YTGyrF9vcRgfkQbpT-e-JdJ7c7VF1dDHgyx9YZRES8w,10934
|
7
7
|
mdbq/aggregation/optimize_data.py,sha256=RXIv7cACCgYyehAxMjUYi_S7rVyjIwXKWMaM3nduGtA,3068
|
8
|
-
mdbq/aggregation/query_data.py,sha256=
|
8
|
+
mdbq/aggregation/query_data.py,sha256=2--y1VNYhL7lCeVA9WjIHiz3K_2JYm9agFqWd5jaeIc,148341
|
9
9
|
mdbq/aggregation/query_data_bak.py,sha256=r1FU0C4zjXln7oVSrRkElh4Ehl-9mYhGcq57jLbViUA,104071
|
10
10
|
mdbq/aggregation/query_data_bak20241124.py,sha256=oY95ZK3qt3Wx9pdZKZ5cvDh45Yi5yGj1kl8G6riumHA,144513
|
11
11
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
@@ -28,7 +28,7 @@ mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
|
|
28
28
|
mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
|
29
29
|
mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
|
30
30
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
31
|
-
mdbq/mysql/mysql.py,sha256=
|
31
|
+
mdbq/mysql/mysql.py,sha256=ZG6BMfoXg6YGnHqv7GfwPwd7RXjoetCAFqPnbdHWqOM,79507
|
32
32
|
mdbq/mysql/recheck_mysql.py,sha256=rgTpvDMWYTyEn7UQdlig-pdXDluTgiU8JG6lkMh8DV0,8665
|
33
33
|
mdbq/mysql/s_query.py,sha256=MbIprZ4yJDAZ9AahZPzl7hqS695Vs0P-AJNwAtA_EEc,9287
|
34
34
|
mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
|
@@ -46,7 +46,7 @@ mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
|
46
46
|
mdbq/req_post/req_tb.py,sha256=qg7pet73IgKGmCwxaeUyImJIoeK_pBQT9BBKD7fkBNg,36160
|
47
47
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
48
48
|
mdbq/spider/aikucun.py,sha256=nIKKZOZbemKqcrikcrMmtksLgJjjzeU0I99teBgU1jE,22439
|
49
|
-
mdbq-3.2.
|
50
|
-
mdbq-3.2.
|
51
|
-
mdbq-3.2.
|
52
|
-
mdbq-3.2.
|
49
|
+
mdbq-3.2.12.dist-info/METADATA,sha256=W62uxvamVOW_S6O91kqwl5N36Nh8QzvKHF-C5ZyiD-w,244
|
50
|
+
mdbq-3.2.12.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
51
|
+
mdbq-3.2.12.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
52
|
+
mdbq-3.2.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|