mdbq 1.9.0__py3-none-any.whl → 1.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +35 -9
- mdbq/aggregation/query_data.py +55 -3
- mdbq/clean/data_clean.py +903 -873
- mdbq/mysql/mysql.py +3 -0
- {mdbq-1.9.0.dist-info → mdbq-1.9.2.dist-info}/METADATA +1 -1
- {mdbq-1.9.0.dist-info → mdbq-1.9.2.dist-info}/RECORD +8 -8
- {mdbq-1.9.0.dist-info → mdbq-1.9.2.dist-info}/WHEEL +0 -0
- {mdbq-1.9.0.dist-info → mdbq-1.9.2.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -61,7 +61,6 @@ class DatabaseUpdate:
         for name in files:
             if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
                 continue
-
             db_name = None  # initialize/reset the variables so they don't carry into the next iteration
             collection_name = None
             for data in datas:  # match db_name and collection_name against the title mapping table
@@ -189,6 +188,22 @@ class DatabaseUpdate:
                 collection_name='店铺来源_日数据_旧版'
             elif name.endswith('.csv') and '客户运营平台_客户列表' in name:
                 df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
+            elif name.endswith('.xlsx') and '直播分场次效果' in name:
+                pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
+                if pattern:
+                    continue
+                df = pd.read_excel(os.path.join(root, name), header=0)
+                if len(df) == 0:
+                    print(f'{name} 报表数据为空')
+                    continue
+                df.replace(to_replace=['--'], value='', regex=False, inplace=True)
+                df.replace(to_replace=[','], value='', regex=True, inplace=True)
+                df['直播开播时间'] = pd.to_datetime(df['直播开播时间'], format='%Y-%m-%d %H:%M:%S', errors='ignore')
+                df.insert(loc=0, column='日期', value=df['直播开播时间'])
+                df['日期'] = df['日期'].apply(
+                    lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
+                df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
+
             elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
                 # shop traffic sources: mobile Taobao search, keywords
                 pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
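The new '直播分场次效果' branch derives a date-only 日期 column from the 直播开播时间 timestamp by splitting the string representation and re-parsing it. Below is a minimal sketch of the same normalization on toy data; it substitutes `errors='coerce'` and `dt.normalize()` for the diff's `errors='ignore'` string round-trip (the `ignore` mode is deprecated in recent pandas), so the data and the substitution are illustrative only:

```python
import pandas as pd

# Toy frame mimicking the report's timestamp column.
df = pd.DataFrame({'直播开播时间': ['2023-11-01 20:30:00', 'bad value']})
# Parse timestamps; 'coerce' turns unparsable cells into NaT instead of
# silently leaving them as strings the way errors='ignore' would.
df['直播开播时间'] = pd.to_datetime(df['直播开播时间'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
# Truncate to midnight: same effect as re-parsing str(x).split(' ')[0] as a date.
df.insert(loc=0, column='日期', value=df['直播开播时间'].dt.normalize())
print(df)
```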
@@ -1115,12 +1130,23 @@ if __name__ == '__main__':
     # database='mysql'
     # )

-    db_name = '生意经2'
-    table_name = '省份城市分析'
-    upload_dir(
-        path='/Users/xigua/数据中心/原始文件2/生意经/地域分布',
-        db_name=db_name,
-        collection_name=table_name,
-        dbs={'mysql': True, 'mongodb': False},
-    )
+    # db_name = '生意经2'
+    # table_name = '省份城市分析'
+    # upload_dir(
+    #     path='/Users/xigua/数据中心/原始文件2/生意经/地域分布',
+    #     db_name=db_name,
+    #     collection_name=table_name,
+    #     dbs={'mysql': True, 'mongodb': False},
+    # )
+    #

+    # new-style data classification
+    dp = DatabaseUpdate(path='/Users/xigua/Downloads')
+    dp.new_unzip(is_move=True)
+    dp.cleaning(is_move=False)  # clean the data into self.datas; no need to move files out right away, keep them with the raw files
+    # push self.datas to the databases
+    dp.upload_df(service_databases=[
+        # {'home_lx': 'mongodb'},
+        {'company': 'mysql'},
+        # {'nas': 'mysql'},
+    ])
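The rewritten entry point replaces the commented-out `upload_dir` call with the full flow: unzip new downloads, clean them into `self.datas`, then push to whichever backends are listed in `service_databases`. Each list entry appears to map a host alias to a database type; the dispatch sketch below is hypothetical, since `upload_df`'s internals are not part of this diff:

```python
# Hypothetical dispatch over a service_databases list such as
# [{'company': 'mysql'}, {'home_lx': 'mongodb'}]; illustrative only.
def dispatch(service_databases):
    for entry in service_databases:
        for host_alias, db_type in entry.items():
            print(f'would upload self.datas via {db_type} on host alias {host_alias!r}')

dispatch([{'company': 'mysql'}])
```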
mdbq/aggregation/query_data.py
CHANGED
@@ -454,6 +454,50 @@ class MysqlDatasQuery:
         )
         return df

+    def zb_ccfx(self):
+        start_date, end_date = self.months_data(num=self.months)
+        projection = {
+            '日期': 1,
+            '店铺': 1,
+            '场次信息': 1,
+            '场次id': 1,
+            '直播开播时间': 1,
+            '开播时长': 1,
+            '封面图点击率': 1,
+            '观看人数': 1,
+            '观看次数': 1,
+            '新增粉丝数': 1,
+            '流量券消耗': 1,
+            '观看总时长(秒)': 1,
+            '人均观看时长(秒)': 1,
+            '次均观看时长(秒)': 1,
+            '商品点击人数': 1,
+            '商品点击次数': 1,
+            '商品点击率': 1,
+            '加购人数': 1,
+            '加购件数': 1,
+            '加购次数': 1,
+            '成交金额(元)': 1,
+            '成交人数': 1,
+            '成交件数': 1,
+            '成交笔数': 1,
+            '成交转化率': 1,
+            '退款人数': 1,
+            '退款笔数': 1,
+            '退款件数': 1,
+            '退款金额(元)': 1,
+            '预售定金支付金额(元)': 1,
+            '预售预估总金额(元)': 1,
+        }
+        df = self.download.data_to_df(
+            db_name='生意参谋2',
+            table_name='直播场次分析',
+            start_date=start_date,
+            end_date=end_date,
+            projection=projection,
+        )
+        return df
+
 class GroupBy:
     """
     数据聚合和导出
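The new `zb_ccfx` method follows the module's existing query pattern: a Mongo-style projection dict (column name → 1) plus a date window, handed to `self.download.data_to_df`. As a rough illustration of what such a projection could translate to on the MySQL side, here is a hypothetical helper; `data_to_df`'s real logic lives in mdbq's download module and is not shown in this diff:

```python
# Hypothetical translation of a projection dict into a MySQL SELECT.
def projection_to_sql(table_name: str, projection: dict, start_date: str, end_date: str) -> str:
    cols = ', '.join(f'`{col}`' for col, keep in projection.items() if keep)
    return (f"SELECT {cols} FROM `{table_name}` "
            f"WHERE `日期` BETWEEN '{start_date}' AND '{end_date}'")

print(projection_to_sql('直播场次分析', {'日期': 1, '店铺': 1, '观看人数': 1},
                        '2024-01-01', '2024-01-31'))
```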
@@ -1016,6 +1060,9 @@ class GroupBy:
                 }
             )
             return df
+        elif '直播场次分析' in table_name:
+            df.drop_duplicates(subset=['日期', '直播开播时间', '观看人数'], keep='first', inplace=True, ignore_index=True)
+            return df
         else:
             print(f'<{table_name}>: Groupby 类尚未配置,数据为空')
             return pd.DataFrame({})
@@ -1056,7 +1103,6 @@ class GroupBy:
         df['毛利率'] = df.apply(lambda x: round((x['销售额'] - x['商品成本']) / x['销售额'], 4) if x['销售额'] > 0 else 0, axis=1)
         df['盈亏'] = df.apply(lambda x: x['商品毛利'] - x['花费'], axis=1)
         return df
-
     def performance_concat(self, bb_tg=True):
         tg, zb, pxb = self.data_tgyj['天猫汇总表调用'], self.data_tgyj['天猫_超级直播'], self.data_tgyj['天猫_品销宝账户报表']
         zb.rename(columns={
@@ -1385,6 +1431,12 @@ def data_aggregation(service_databases=[{}], months=1):
             '唯一主键': ['日期', '关键词', '访客数'],
             '数据主体': sdq.tm_search(),
         },
+        {
+            '数据库名': '聚合数据',
+            '集合名': '生意参谋_直播场次分析',
+            '唯一主键': ['日期', '直播开播时间'],
+            '数据主体': sdq.zb_ccfx(),
+        },
     ]
     for items in data_dict:  # iterate over the returned results
         db_name, table_name, unique_key_list, df = items['数据库名'], items['集合名'], items['唯一主键'], items['数据主体']
@@ -1402,7 +1454,7 @@ def data_aggregation(service_databases=[{}], months=1):
             service_database=service_database,
         )
         g.sp_index_datas = pd.DataFrame()  # reset, otherwise the next iteration keeps flushing into the database
-        # g.as_csv(df=df, filename=table_name + '.csv')  # export csv
+        # # g.as_csv(df=df, filename=table_name + '.csv')  # export csv
         if '日期' in df.columns.tolist():
             m.df_to_mysql(
                 df=df,
@@ -1483,7 +1535,7 @@ def main():
 

 if __name__ == '__main__':
-    data_aggregation(service_databases=[{'company': 'mysql'}], months=
+    data_aggregation(service_databases=[{'company': 'mysql'}], months=1)  # normal run: aggregate all the data
     # data_aggregation_one(service_databases=[{'company': 'mysql'}], months=1)  # aggregate a single database on its own; edit inside the function to choose it
     # optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650)  # kick off cleanup of the aggregated data immediately