mdbq 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- mdbq/aggregation/optimize_data.py +0 -46
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/METADATA +1 -1
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/RECORD +5 -7
- mdbq/aggregation/aggregation_bak.py +0 -1438
- mdbq/mongo/mongo.py +0 -729
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/WHEEL +0 -0
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/top_level.txt +0 -0
@@ -1,1438 +0,0 @@

The entire file below was removed. Reconstructed content of the deleted mdbq/aggregation/aggregation_bak.py (gutter markup stripped, Chinese comments and docstrings translated; a stray IDE auto-import of `inplace` from unittest.mock and two chained-assignment/index bugs are fixed in passing):

# -*- coding:utf-8 -*-
import warnings

import pandas as pd
import numpy as np
import chardet
import zipfile
import socket

from pyzipper import PyZipFile
import os
import platform
import json
from mdbq.mongo import mongo
from mdbq.mysql import mysql
from mdbq.aggregation import df_types
from mdbq.config import get_myconf
from mdbq.config import set_support
from mdbq.config import myconfig
from mdbq.dataframe import converter
import datetime
import time
import re
import shutil
import getpass
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')
"""
This file is no longer maintained.

1. DatabaseUpdate: cleans the raw data downloaded by the crawler and writes it to the database;
   on upload it verifies and updates the dtypes info in the local json file;
   if the json lacks dtypes info, types are inferred from the df and written back to the local json;
   the local json can also be edited by hand, and manual edits take precedence;
2. upload_dir: uploads an entire folder to the database;
"""

username, password, host, port, service_database = None, None, None, None, None
if socket.gethostname() in ['xigua_lx', 'xigua1', 'MacBookPro']:
    conf = myconfig.main()
    conf_data = conf['Windows']['xigua_lx']['mysql']['local']
    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']
    service_database = {'xigua_lx': 'mysql'}
elif socket.gethostname() in ['company', 'Mac2.local']:
    conf = myconfig.main()
    conf_data = conf['Windows']['company']['mysql']['local']
    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']
    service_database = {'company': 'mysql'}
if not username:
    print(f'找不到主机: {socket.gethostname()}')


def get_encoding(path):
    """
    Detect the file's encoding. Reads the whole file, so it is slow; avoid unless necessary.
    """
    with open(path, 'rb') as f:
        f1 = f.read()
        encod = chardet.detect(f1).get('encoding')
    return encod


class DatabaseUpdateBak:
    """
    Cleans files and writes them to the database; called by tg.py
    """
    def __init__(self, path):
        self.path = path  # data directory, i.e. the download folder
        self.datas: list = []  # datasets waiting to be written to the database
        self.start_date = '2022-01-01'  # start date for the date table

    def cleaning(self, is_move=True, is_except=[]):
        """
        Data cleaning; collects the database name, collection name and the df body.
        When modifying cleaning, keep 标题对照表.csv under support in sync.
        is_except: files or folders to exclude from processing
        """
        if not os.path.exists(self.path):
            print(f'1.1.0 初始化时传入了不存在的目录: {self.path}')
            return

        filename = '标题对照表.csv'
        support_file = set_support.SetSupport(dirname='support').dirname
        if not os.path.isfile(os.path.join(support_file, filename)):
            print(f'缺少关键文件支持: {os.path.join(support_file, filename)}')
            return
        df = pd.read_csv(os.path.join(support_file, filename), encoding='utf-8_sig', header=0, na_filter=False)
        datas = df.to_dict('records')  # convert to a list of dicts
        # print(datas)

        for root, dirs, files in os.walk(self.path, topdown=False):
            for name in files:
                check_remove_file = False  # guard flag: avoid deleting unrelated files; files not covered by this cleaner are left alone
                if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # files or folders excluded from processing
                    continue

                db_name = None  # initialize/reset before the next iteration
                collection_name = None
                for data in datas:  # match db_name and collection_name from the title mapping table
                    if data['关键词1'] in name and data['关键词2'] in name:
                        db_name = data['数据库名']
                        collection_name = data['数据表']
                        # print(name, db_name, collection_name)
                # return

                # only csv, xls and xlsx files are processed
                if not name.endswith('.csv') and not name.endswith('.xls') and not name.endswith('.xlsx'):
                    continue
                df = pd.DataFrame()  # initialize df
                encoding = self.get_encoding(file_path=os.path.join(root, name))  # used when reading csv files
                tg_names = [
                    # '账户报表',  # old name, later renamed 营销场景报表; C-stores still use the old one
                    '营销场景报表',
                    '计划报表',
                    '单元报表',
                    '关键词报表',
                    '人群报表',
                    '主体报表',
                    '其他主体报表',
                    '创意报表',
                    '地域报表',
                    '权益报表',
                ]
                for tg_name in tg_names:
                    if tg_name in name and '报表汇总' not in name and name.endswith('.csv'):  # exclude the 达摩盘 report: 人群报表汇总
                        pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
                        if not pattern:  # already converted
                            continue
                        shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
                        if shop_name:
                            shop_name = shop_name[0]
                        else:
                            shop_name = ''
                        df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                        if '地域' not in name:  # except for 地域报表, check that the data contains the 场景名字 field; if not, the "pbix" template was not selected
                            ck = df.columns.tolist()
                            if '场景名字' not in ck:
                                print(f'1.2.0 {name} 报表字段缺失, 请选择Pbix数据模板下载')
                                check_remove_file = True
                                continue
                        if len(df) == 0:
                            print(f'1.3.0 {name} 报表是空的, 请重新下载')
                            check_remove_file = True
                            continue
                        cols = df.columns.tolist()
                        if '日期' not in cols:
                            print(f'1.4.0 {name} 报表不包含分日数据, 已跳过')
                            check_remove_file = True
                            continue
                        if '省' in cols:
                            if '市' not in cols:
                                print(f'1.5.0 {name} 请下载市级地域报表,而不是省报表')
                                check_remove_file = True
                                continue
                        # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # strip special characters
                        # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                        # df.fillna(0, inplace=True)
                        if '省' in df.columns.tolist() and '场景名字' in df.columns.tolist() and '地域报表' in name:
                            if shop_name == '广东万里马':
                                db_name = '推广数据_淘宝店'
                            else:
                                db_name = '推广数据2'
                            collection_name = f'完整_{tg_name}'
                        else:
                            if shop_name == '广东万里马':
                                db_name = '推广数据_淘宝店'
                            else:
                                db_name = '推广数据2'
                            collection_name = f'{tg_name}'
                        check_remove_file = True
                if name.endswith('.csv') and '超级直播' in name:
                    # 超级直播
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
                    if not pattern:  # already converted
                        continue
                    shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
                    if shop_name:
                        shop_name = shop_name[0]
                    else:
                        shop_name = ''
                    # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # strip special characters
                    # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                    if shop_name == '广东万里马':
                        db_name = '推广数据_淘宝店'
                    check_remove_file = True
                elif name.endswith('.xls') and '短直联投' in name:
                    # 短直联投
                    df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
                    df = pd.concat(df)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                    check_remove_file = True
                elif name.endswith('.xls') and '视频加速推广' in name:
                    # 超级短视频
                    df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
                    df = pd.concat(df)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                    check_remove_file = True
                if '人群报表汇总' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    check_remove_file = True
                # ----------------- promotion reports divider -----------------
                # ----------------- promotion reports divider -----------------
                date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
                date02 = re.findall(r'\d{4}-\d{2}-\d{2}_(\d{4}-\d{2}-\d{2})', str(name))
                attrib_pattern = re.findall(r'(\d+).xlsx', name)  # Tmall product material sheets; essential
                if name.endswith('.xls') and '生意参谋' in name and '无线店铺流量来源' in name:
                    # wireless store traffic sources
                    df = pd.read_excel(os.path.join(root, name), header=5)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
                    # df.replace(to_replace=[','], value='', regex=True, inplace=True)
                    if date01[0] != date02[0]:
                        data_lis = date01[0] + '_' + date02[0]
                        df.insert(loc=0, column='数据周期', value=data_lis)
                    df.insert(loc=0, column='日期', value=date01[0])
                    # 2024-2-19 the platform renamed its promotion channel source names
                    df['三级来源'] = df['三级来源'].apply(
                        lambda x: '精准人群推广' if x == '精准人群推广(原引力魔方)'
                        else '关键词推广' if x == '关键词推广(原直通车)'
                        else '智能场景' if x == '智能场景(原万相台)'
                        else x
                    )
                    db_name = '生意参谋2'
                    if '经营优势' in df['一级来源'].tolist():  # new-style traffic
                        if '数据周期' in df.columns.tolist():
                            collection_name = '店铺来源_月数据'
                        else:
                            collection_name = '店铺来源_日数据'
                    else:  # old-style traffic
                        if '数据周期' in df.columns.tolist():
                            collection_name = '店铺来源_月数据_旧版'
                        else:
                            collection_name = '店铺来源_日数据_旧版'
                    check_remove_file = True
                elif name.endswith('.csv') and '客户运营平台_客户列表' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.xlsx') and '直播分场次效果' in name:
                    pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
                    if pattern:
                        check_remove_file = True
                        continue
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    df.replace(to_replace=[','], value='', regex=True, inplace=True)
                    df['直播开播时间'] = pd.to_datetime(df['直播开播时间'], format='%Y-%m-%d %H:%M:%S', errors='ignore')
                    df.insert(loc=0, column='日期', value=df['直播开播时间'])
                    df['日期'] = df['日期'].apply(
                        lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
                    df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
                    check_remove_file = True

                elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
                    # store sources: mobile Taobao search keywords
                    pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
                    df = pd.read_excel(os.path.join(root, name), header=5)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        os.remove(os.path.join(root, name))
                        continue
                    df.replace(to_replace=[','], value='', regex=True, inplace=True)
                    df.insert(loc=0, column='日期', value=pattern[0][1])
                    df.rename(columns={
                        '来源名称': '关键词',
                        '收藏商品-支付买家数': '收藏商品_支付买家数',
                        '加购商品-支付买家数': '加购商品_支付买家数',
                    }, inplace=True)
                    if pattern[0][0] != pattern[0][1]:
                        data_lis = pattern[0][0] + '_' + pattern[0][1]
                        df.insert(loc=1, column='数据周期', value=data_lis)
                    check_remove_file = True

                elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
                    # store product rankings
                    df = pd.read_excel(os.path.join(root, name), header=4)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
                    # df.replace(to_replace=[','], value='', regex=True, inplace=True)
                    df.rename(columns={'统计日期': '日期', '商品ID': '商品id'}, inplace=True)
                    if date01[0] != date02[0]:
                        data_lis = date01[0] + '_' + date02[0]
                        df.insert(loc=1, column='数据周期', value=data_lis)
                    check_remove_file = True
                elif name.endswith('.xls') and '参谋店铺整体日报' in name:
                    # self-service export: store daily report
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    check_remove_file = True
                elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
                    # self-service export: daily traffic
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    # 2024-2-19 the platform renamed its promotion channel source names; self-service exports were not updated, so force the rename here
                    df['三级来源'] = df['三级来源'].apply(
                        lambda x: '精准人群推广' if x == '引力魔方'
                        else '关键词推广' if x == '直通车'
                        else '智能场景' if x == '万相台'
                        else '精准人群推广' if x == '精准人群推广(原引力魔方)'
                        else '关键词推广' if x == '关键词推广(原直通车)'
                        else '智能场景' if x == '智能场景(原万相台)'
                        else x
                    )
                    check_remove_file = True
                elif name.endswith('.xls') and '商品sku' in name:
                    # self-service export: product sku
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={
                        '统计日期': '日期',
                        '商品ID': '商品id',
                        'SKU ID': 'sku id',
                        '商品SKU': '商品sku',
                    }, inplace=True)
                    check_remove_file = True
                elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
                    # self-service export: monthly store traffic sources
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={'统计日期': '数据周期'}, inplace=True)
                    # 2024-2-19 the platform renamed its promotion channel source names; self-service exports were not updated, so force the rename here
                    df['三级来源'] = df['三级来源'].apply(
                        lambda x: '精准人群推广' if x == '引力魔方'
                        else '关键词推广' if x == '直通车'
                        else '智能场景' if x == '万相台'
                        else '精准人群推广' if x == '精准人群推广(原引力魔方)'
                        else '关键词推广' if x == '关键词推广(原直通车)'
                        else '智能场景' if x == '智能场景(原万相台)'
                        else x
                    )
                    df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
                    check_remove_file = True
                elif name.endswith('.csv') and '分天数据-计划_活动类型-推广概览-数据汇总' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    df['日期'].replace(to_replace=['\\t'], value='', regex=True, inplace=True)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    # min_clm = str(df['日期'].min()).split(' ')[0]
                    # max_clm = str(df['日期'].max()).split(' ')[0]
                    # new_name = f'淘宝联盟_分天数据_计划_活动类型_推广概览_数据汇总_{min_clm}_{max_clm}'
                    check_remove_file = True
                elif name.endswith('.csv') and 'baobei' in name:
                    # 生意经 daily product metrics
                    date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
                    if not date:  # block monthly data and already-converted sheets
                        print(f'{name} 不支持或是已转换的表格')
                        os.remove(os.path.join(root, name))  # delete outright so it is not sorted with the raw files; a mismatched encoding would raise errors
                        check_remove_file = True
                        continue
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        os.remove(os.path.join(root, name))
                        check_remove_file = True
                        continue
                    if '日期' in df.columns.tolist():
                        df.pop('日期')
                    new_date = '-'.join(date[0])
                    df.insert(loc=0, column='日期', value=new_date)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    check_remove_file = True
                elif name.endswith('.csv') and '店铺销售指标' in name:
                    # 生意经 store metrics; intended for monthly data, daily also works
                    name_st = re.findall(r'(.*)\(分日', name)
                    if not name_st:
                        print(f'{name} 已转换的表格')
                        check_remove_file = True
                        continue
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df['日期'] = df['日期'].astype(str).apply(
                        lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    check_remove_file = True
                elif name.endswith('csv') and '省份城市分析' in name:
                    # 生意经 regional distribution; daily data only
                    pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
                    if not pattern or '省份城市分析2' not in name:
                        print(f'{name} 不支持或已转换的表格')
                        os.remove(os.path.join(root, name))  # delete outright so it is not sorted with the raw files; a mismatched encoding would raise errors
                        check_remove_file = True
                        continue
                    date = '-'.join(pattern[0][1:])
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        os.remove(os.path.join(root, name))
                        continue
                    df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
                    df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
                    df['省'].fillna(method='ffill', inplace=True)
                    df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
                    pov = df.pop('省')
                    city = df.pop('城市')
                    df.insert(loc=1, column='城市', value=city)
                    df.insert(loc=0, column='日期', value=date)
                    df['省份'] = pov
                    df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
                    df.replace('NAN', 0, inplace=True)
                    df['笔单价'] = df.apply(lambda x: 0 if x['销售量'] == 0 else 0 if x['销售量'] == '0' else x['笔单价'], axis=1)
                    check_remove_file = True
                elif name.endswith('csv') and 'order' in name:
                    # 生意经 order data; monthly data only
                    pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
                    if not pattern:
                        print(f'{name} 不支持或已转换的表格')
                        # os.remove(os.path.join(root, name))  # delete outright so it is not sorted with the raw files; a mismatched encoding would raise errors
                        check_remove_file = True
                        continue
                    date1 = pattern[0][1:4]
                    date1 = '-'.join(date1)
                    date2 = pattern[0][4:]
                    date2 = '-'.join(date2)
                    date = f'{date1}_{date2}'
                    df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.insert(loc=0, column='日期', value=date1)
                    df.insert(loc=1, column='数据周期', value=date)
                    df['商品id'] = df['宝贝链接'].apply(
                        lambda x: re.sub('.*id=', '', x) if x else x)
                    df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
                    df['颜色编码'] = df['商家编码'].apply(
                        lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
                    check_remove_file = True
                elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
                    # live-room transaction order details
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
                    df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
                    check_remove_file = True
                elif name.endswith('.xlsx') and '直播间大盘数据' in name:
                    # live-room overall market data
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    check_remove_file = True
                elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
                    # live performance: transaction breakdown
                    df = pd.read_excel(os.path.join(root, name), header=5)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    check_remove_file = True
                elif name.endswith('.csv') and '淘宝店铺数据' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '人群洞察' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    df = df[(df['人群规模'] != '') & (df['人群规模'] != '--')]
                    if len(df) == 0:
                        try:
                            os.remove(os.path.join(root, name))  # remove the source file
                        except:
                            pass
                        continue
                    if is_move:
                        try:
                            os.remove(os.path.join(root, name))  # whether to remove the source file
                            check_remove_file = True
                        except Exception as e:
                            print(f'{name}, {e}')
                elif name.endswith('.csv') and '客户_客户概况_画像' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '市场排行_店铺' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_商品发现' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_汇总' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_商品发现' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_汇总' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '搜索排行_搜索' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '竞店分析-销售分析-关键指标对比' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '竞店分析-销售分析-top商品榜' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '竞店分析-来源分析-入店来源' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '竞店分析-来源分析-入店搜索词' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                elif name.endswith('.csv') and '爱库存_商品榜单_spu_' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    check_remove_file = True
                # ----------------------- JD data processing divider -----------------------
                # ----------------------- JD data processing divider -----------------------
                elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
                    # JD store traffic sources
                    if '按天' not in name:
                        print(f'{name} 京东流量请按天下载')
                        check_remove_file = True
                        continue
                    date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
                    new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
                    new_date02 = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
                    new_date03 = f'{new_date01}_{new_date02}'
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.insert(loc=0, column='日期', value=new_date01)
                    if new_date01 != new_date02:
                        df.insert(loc=1, column='数据周期', value=new_date03)
                    cols = df.columns.tolist()
                    for col_2024 in cols:  # this JD sheet appends last year's dates to some field names; drop those YoY columns or the column count explodes
                        if '20' in col_2024 and '流量来源' in name:
                            df.drop(col_2024, axis=1, inplace=True)
                    check_remove_file = True
                elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
                    # JD product detail file conversion
                    date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
                    if not date1:
                        print(f'{name}: 仅支持日数据')
                        check_remove_file = True
                        continue
                    if date1:
                        date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
                        new_name = f'sku_{date1}_全部渠道_商品明细.csv'
                    elif '10021440233518' in df['商品ID'].values or '10022867813485' in df['商品ID'].values:
                        new_name = f'spu_{date1}_全部渠道_商品明细.csv'
                    else:
                        new_name = f'未分类_{date1}_全部渠道_商品明细.csv'
                    df.rename(columns={'商品ID': '商品id'}, inplace=True)
                    df.insert(loc=0, column='日期', value=date1)
                    df.loc[0, '最近上架时间'] = df.loc[1, '最近上架时间']  # fill this cell to avoid a mysql date-type error on upload
                    if 'sku' in new_name:  # keep this check even with the mapping table; spu/sku were added later
                        db_name = '京东数据2'
                        collection_name = 'sku_商品明细'
                    elif 'spu' in new_name:
                        db_name = '京东数据2'
                        collection_name = 'spu_商品明细'
                    check_remove_file = True
                elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
                    # JD ranking under product keywords
                    try:
                        pattern = re.findall(r'(\d{4}-\d{2}-\d{2})-(\d{4}-\d{2}-\d{2})', name)
                        if not pattern:
                            check_remove_file = True
                            continue
                        if pattern[0][0] == pattern[0][1]:
                            print(f'{name}: 检测到数据周期异常,仅支持7天数据')
                            check_remove_file = True
                            continue
                        df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
                        if len(df) == 0:
                            print(f'{name} 报表数据为空')
                            check_remove_file = True
                            continue
                        if len(df.columns.tolist()) < 20:
                            print(f'{name}: 报表可能缺失诊断数据')
                            os.remove(os.path.join(root, name))
                            check_remove_file = True
                            continue
                        df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
                        for col in ['词人气', '搜索点击率']:
                            if col in df.columns.tolist():
                                df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
                        check_remove_file = True
                    except Exception as e:
                        print(e)
                        print(name, '报错')
                        os.remove(os.path.join(root, name))
                        check_remove_file = True
                        continue
                elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
                    # JD product ranking
                    date_in = re.findall(r'(\d{4}-\d{2}-\d{2})-搜索', str(name))[0]
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.insert(0, '日期', date_in)  # insert new column
                    df.rename(columns={'SKU': 'skuid'}, inplace=True)
                    if '点击率' in df.columns.tolist():
                        df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
                    check_remove_file = True
                elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
                    # JD competition: competitor overview details, all channels
                    date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
                    start_date = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
                    # end_date = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df.insert(loc=0, column='日期', value=start_date)
                    check_remove_file = True
                elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
                    # JD self-service report: store daily report
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    if '访客数-全部渠道' not in df.columns.tolist():  # verify this really is the JD daily report
                        continue
                    df['日期'] = df['日期'].apply(
                        lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
                    )
                    check_remove_file = True
                elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
                    # JD industry merchant rankings
                    date2 = re.findall(r'_\d{8}-\d+', name)
                    if date2:
                        print(f'{name}: 请下载日数据,不支持其他周期')
                        # os.remove(os.path.join(root, name))  # delete outright so it is not sorted with the raw files; a mismatched encoding would raise errors
                        check_remove_file = True
                        continue
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
                    df.insert(loc=0, column='类型', value='商家榜单')
                    check_remove_file = True
                elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
                    # JD sku export
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    d_time = datetime.datetime.today().strftime('%Y-%m-%d')
                    df.insert(loc=0, column='日期', value=d_time)
                    df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
                    check_remove_file = True
                elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
                    # JD spu export
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    d_time = datetime.datetime.today().strftime('%Y-%m-%d')
                    df.insert(loc=0, column='日期', value=d_time)
                    check_remove_file = True
                elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
                    # JD promotion data
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                    check_remove_file = True
                elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                    df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
                    df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
                    check_remove_file = True
                elif name.endswith('.xlsx') and '零售明细统计' in name:
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        check_remove_file = True
                        continue
                    df = df[df['缩略图'] != '合计']
                    check_remove_file = True
                elif name.endswith('.csv') and '营销概况_全站营销' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
                    df = df[(df['日期'] != '日期') & (df['日期'] != '汇总') & (df['日期'] != '0') & (df['花费'] != '0') & (df['花费'] != '0.00')]
                    df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                    df.drop("'当前时间'", axis=1, inplace=True)
                    df.rename(columns={'全站ROI': '全站roi'}, inplace=True)
                    df.insert(loc=1, column='产品线', value='全站营销')
                    check_remove_file = True
                elif name.endswith('.csv') and '关键词点击成交报表_pbix同步_勿删改' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    for col in df.columns.tolist():
                        if '(' in col:
                            new_col = re.sub('[()]', '_', col)
                            new_col = new_col.strip('_')
                            df.rename(columns={col: new_col}, inplace=True)
                    df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    # min_clm = str(df['日期'].min()).split(' ')[0]
                    # max_clm = str(df['日期'].max()).split(' ')[0]
                    check_remove_file = True

                # product materials; must remain the last branch
                elif name.endswith('xlsx'):
                    """Files downloaded from the Tmall material library: add the file's modified date to the df and file name"""
                    if attrib_pattern:
                        df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
                        cols = df.columns.tolist()
                        if '商品白底图' in cols and '方版场景图' in cols:
                            f_info = os.stat(os.path.join(root, name))  # read the file's stat info
                            mtime = time.strftime('%Y-%m-%d', time.localtime(f_info.st_mtime))  # file modification date
                            df['日期'] = mtime
                            df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                            df.rename(columns={'商品ID': '商品id'}, inplace=True)
                            sp_id = df['商品id'].tolist()
                            if 652737455554 in sp_id or 683449516249 in sp_id or 37114359548 in sp_id or 570735930393 in sp_id:
                                df.insert(0, '店铺名称', '万里马官方旗舰店')  # insert new column
                            elif 704624764420 in sp_id or 701781021639 in sp_id or 520380314717 in sp_id:
                                df.insert(0, '店铺名称', '万里马官方企业店')  # insert new column
                            else:
                                df.insert(0, '店铺名称', 'coome旗舰店')  # insert new column
                            db_name = '属性设置2'
                            collection_name = '商品素材导出'
                        else:
                            df = pd.DataFrame()
                        check_remove_file = True

                if is_move and check_remove_file:
                    try:
                        os.remove(os.path.join(root, name))  # whether to remove the source file
                    except Exception as e:
                        print(f'{name}, {e}')
                if len(df) > 0:
                    if not db_name or not collection_name:
                        print(f'从本地csv文件中,根据文件标题匹配数据库名和数据表,结果存在空值,db_name: {db_name}, collection_name: {collection_name}')
                    # else:
                    # queue the data in self.datas for the database update
                    self.datas.append(
                        {
                            '数据库名': db_name,
                            '集合名称': collection_name,
                            '数据主体': df,
                            '文件名': name,
                        }
                    )

        # 品销宝 workbooks contain multiple sheets; best handled separately
        for root, dirs, files in os.walk(self.path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # files or folders excluded from processing
                    continue
                db_name = None  # initialize/reset before the next iteration
                collection_name = None
                for data in datas:  # match db_name and collection_name from the title mapping table
                    if data['关键词1'] in name and data['关键词2'] in name:
                        db_name = data['数据库名']
                        collection_name = data['数据表']

                # df = pd.DataFrame()
                if name.endswith('.xlsx') and '明星店铺' in name:
                    # 品销宝
                    pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})_', name)
                    if pattern:
                        continue
                    sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群']  # 品销宝
                    file_name4 = os.path.splitext(name)[0]  # 明星店铺 report
                    for sheet4 in sheets4:
                        df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
                        df = df[df['搜索量'] > 0]
                        if len(df) < 1:
                            # print(f'{name}/{sheet4} 跳过')
                            continue
                        df.insert(loc=1, column='报表类型', value=sheet4)
                        if not db_name or not collection_name:
                            print(
                                f'从本地csv文件中,根据文件标题匹配数据库名和数据表,结果存在空值,db_name: {db_name}, collection_name: {collection_name}')
                        else:
                            # queue the data in self.datas for the database update
                            self.datas.append(
                                {
                                    '数据库名': db_name,
                                    '集合名称': collection_name,
                                    '数据主体': df,
                                    '文件名': name,
                                }
                            )
                    if is_move:
                        os.remove(os.path.join(root, name))

        # df = self.date_table()  # build a date table
        # self.datas.append(
        #     {
        #         '数据库名': '聚合数据',
        #         '集合名称': '日期表',
        #         '数据主体': df,
        #         '文件名': '日期表文件名',
        #     }
        # )

    def upload_df(self, service_databases=[{}], path=None):
        """
        Upload the cleaned dfs to the database; called by copysh.py
        """
        df_to_json = df_types.DataTypes()  # json file holding the data's dtypes info
        for service_database in service_databases:
            for service_name, database in service_database.items():
                # print(service_name, database)
                if database == 'mongodb':
                    d = mongo.UploadMongo(
                        username=username,
                        password=password,
                        host=host,
                        port=port,
                        drop_duplicates=False,
                    )
                    for data in self.datas:
                        db_name, collection_name, df = data['数据库名'], data['集合名称'], data['数据主体']
                        df_to_json.get_df_types(
                            df=df,
                            db_name=db_name,
                            collection_name=collection_name,
                            is_file_dtype=True,  # local file takes precedence by default: True
                        )
                        d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)
                    if d.client:
                        d.client.close()

                elif database == 'mysql':
                    m = mysql.MysqlUpload(
                        username=username,
                        password=password,
                        host=host,
                        port=port,
                    )
                    for data in self.datas:
                        df, db_name, collection_name, rt_filename = data['数据主体'], data['数据库名'], data['集合名称'], data['文件名']
                        df_to_json.get_df_types(
                            df=df,
                            db_name=db_name,
                            collection_name=collection_name,
                            is_file_dtype=True,  # local file takes precedence by default: True
                        )
                        m.df_to_mysql(
                            df=df,
                            db_name=db_name,
                            table_name=collection_name,
                            move_insert=True,  # delete first, then insert
                            df_sql=False,  # when True, upload the whole table via df.to_sql without deduplication
                            drop_duplicates=False,  # when True, check for duplicates before inserting; slower
                            count=None,
                            filename=rt_filename,  # used to track processing progress
                        )
        df_to_json.as_json_file()  # write the json file holding the data's dtypes info

    def new_unzip(self, path=None, is_move=None):
        """
        {Unzip and remove zip files}
        For JD product-detail files the process is:
        1. read the file names inside the zip
        2. build the full path and check whether the folder already contains a file of the same name
        3. if so, rename the existing file (extract the date from the name and rebuild the file name)
        4. then extract the zip
        5. the freshly extracted file still needs renaming via _jd_rename
        is_move: whether to remove all zip files from the download directory
        """
        if not path:
            path = self.path
        res_names = []  # zip files to remove
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or 'DS_Store' in name or 'baidu' in name or 'xunlei' in name:
                    continue
                if name.endswith('.zip'):
                    old_file = os.path.join(root, name)
                    f = zipfile.ZipFile(old_file, 'r')
                    if len(f.namelist()) == 1:  # the zip contains a single file
                        for zip_name in f.namelist():  # read file names inside the zip
                            # zip_name_1 = zip_name.encode('cp437').decode('utf-8')
                            try:
                                zip_name_1 = zip_name.encode('utf-8').decode('utf-8')
                            except:
                                zip_name_1 = zip_name.encode('cp437').decode('utf-8')
                            new_path = os.path.join(root, zip_name_1)  # path of the file after extraction
                            if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path:  # a file with the same name as the zip entry already exists
                                # special handling for JD files
                                df = pd.read_excel(new_path)
                                try:
                                    pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                                          name)
                                    pattern2 = re.findall(
                                        r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                        name)
                                    if pattern1:
                                        year_date = '-'.join(list(pattern1[0])) + '_' + '-'.join(list(pattern1[0]))
                                    elif pattern2:
                                        year_date = '-'.join(list(pattern2[0])[0:3]) + '_' + '-'.join(
                                            list(pattern2[0])[3:7])
                                    else:
                                        year_date = '无法提取日期'
                                        print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                                    if ('10035975359247' in df['商品ID'].values or '10056642622343' in
                                            df['商品ID'].values):
                                        os.rename(new_path,
                                                  os.path.join(root, 'sku_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    elif ('10021440233518' in df['商品ID'].values or '10022867813485' in
                                          df['商品ID'].values):
                                        os.rename(new_path,
                                                  os.path.join(root, 'spu_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    if is_move:
                                        os.remove(os.path.join(root, name))
                                except Exception as e:
                                    print(e)
                                    continue
                            else:
                                f.extract(zip_name, root)
                                if zip_name_1 != zip_name:
                                    os.rename(os.path.join(root, zip_name), os.path.join(root, zip_name_1))
                                if is_move:
                                    res_names.append(name)
                                    # os.remove(os.path.join(root, name))  # cannot remove here: the file would be reported as in use
                        f.close()
                    else:  # the zip contains multiple files
                        f.close()
                        self.unzip_all(path=old_file, save_path=path)

        if is_move:
            for name in res_names:
                os.remove(os.path.join(path, name))
                print(f'移除{os.path.join(path, name)}')

    def unzip_all(self, path, save_path):
        """
        Walk the directory and rename files with mojibake names.
        2. If the zip extracts to a folder, save into a new folder and delete the mojibake folder.
        3. Delete the macOS temp folder __MACOSX.
        """
        with PyZipFile(path) as _f:
            _f.extractall(save_path)
            _f.close()
        for _root, _dirs, _files in os.walk(save_path, topdown=False):
            for _name in _files:
                if '~$' in _name or 'DS_Store' in _name:
                    continue
                try:
                    _new_root = _root.encode('cp437').decode('utf-8')
                    _new_name = _name.encode('cp437').decode('utf-8')
                except:
                    _new_root = _root.encode('utf-8').decode('utf-8')
                    _new_name = _name.encode('utf-8').decode('utf-8')
                _old = os.path.join(_root, _name)
                _new = os.path.join(_new_root, _new_name)
                if _new_root != _root:  # mojibake directory: create a new one
                    os.makedirs(_new_root, exist_ok=True)
                os.rename(_old, _new)
            try:
                _new_root = _root.encode('cp437').decode('utf-8')
            except:
                _new_root = _root.encode('utf-8').decode('utf-8')
            if _new_root != _root or '__MACOSX' in _root:
                shutil.rmtree(_root)

    def get_encoding(self, file_path):
        """
        Detect the file's encoding. Reads the whole file, so it is slow; avoid unless necessary.
        """
        with open(file_path, 'rb') as f:
            f1 = f.read()
            encod = chardet.detect(f1).get('encoding')
        return encod

    def date_table(self, service_databases=[{}]):
        """
        Build the date table used by pbix.
        """
        yesterday = time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400))
        dic = pd.date_range(start=self.start_date, end=yesterday)
        df = pd.DataFrame(dic, columns=['日期'])
        df.sort_values('日期', ascending=True, ignore_index=True, inplace=True)
        df.reset_index(inplace=True)
        # reset_index(inplace=True) adds the index back into df as a column
        p = df.pop('index')
        df['月2'] = df['日期']
        df['月2'] = df['月2'].dt.month
        df['日期'] = df['日期'].dt.date  # keep year-month-day, drop time-of-day
        df['年'] = df['日期'].apply(lambda x: str(x).split('-')[0] + '年')
        df['月'] = df['月2'].apply(lambda x: str(x) + '月')
        # df.drop('月2', axis=1, inplace=True)
        mon = df.pop('月2')
        df['日'] = df['日期'].apply(lambda x: str(x).split('-')[2])
        df['年月'] = df.apply(lambda x: x['年'] + x['月'], axis=1)
        df['月日'] = df.apply(lambda x: x['月'] + x['日'] + '日', axis=1)
        df['第n周'] = df['日期'].apply(lambda x: x.strftime('第%W周'))
        df['索引'] = p
        df['月索引'] = mon
        df.sort_values('日期', ascending=False, ignore_index=True, inplace=True)

        for service_database in service_databases:
            for service_name, database in service_database.items():
                m = mysql.MysqlUpload(
                    username=username,
                    password=password,
                    host=host,
                    port=port,
                )
                m.df_to_mysql(
                    df=df,
                    db_name='聚合数据',
                    table_name='日期表',
                    move_insert=True,  # delete first, then insert
                    df_sql=False,  # when True, upload the whole table via df.to_sql without deduplication
                    drop_duplicates=False,  # when True, check for duplicates before inserting; slower
                    count=None,
                    filename=None,  # used to track processing progress
                )
        # return df

    def other_table(self, service_databases=[{'home_lx': 'mysql'}]):
        """ Upload 主推货品.xlsx from the support folder """
        support_file = set_support.SetSupport(dirname='support').dirname
        filename = '主推货品.xlsx'
        if not os.path.isfile(os.path.join(support_file, filename)):
            return
        # df = pd.read_csv(os.path.join(support_file, filename), encoding='utf-8_sig', header=0, na_filter=False)
        df = pd.read_excel(os.path.join(support_file, filename), header=0)
        for col in df.columns.tolist():
            if '预算' in col:
                df.rename(columns={col: '预算占比'}, inplace=True)
        df = df[['商品id', '商家编码', '预算占比']]
        df['日期'] = datetime.datetime.now().strftime('%Y-%m-%d')
        for service_database in service_databases:
            for service_name, database in service_database.items():
                m = mysql.MysqlUpload(
                    username=username,
                    password=password,
                    host=host,
                    port=port,
                )
                m.df_to_mysql(
                    df=df,
                    db_name='属性设置2',
                    table_name='主推货品',
                    move_insert=False,  # delete first, then insert
                    df_sql=False,  # when True, upload the whole table via df.to_sql without deduplication
                    drop_duplicates=True,  # when True, check for duplicates before inserting; slower
                    count=None,
                    filename=None,  # used to track processing progress
                )


def upload_dir(path, db_name, collection_name, json_path=None):
    """ Upload a whole folder to a mysql or mongodb database """
    if not os.path.isdir(path):
        print(f'{os.path.splitext(os.path.basename(__file__))[0]}.upload_dir: 函数只接受文件夹路径,不是一个文件夹: {path}')
        return

    m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)

    # read the df's dtype info from the local json file
    df_to_json = df_types.DataTypes()
    dtypes = df_to_json.load_dtypes(
        db_name=db_name,
        collection_name=collection_name,
    )

    count = 0
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            count += 1
    i = 1  # tracks progress through the files
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            if '~$' in name or '.DS' in name or '.localized' in name or 'baidu' in name:
                i += 1
                continue
            if name.endswith('.csv'):
                df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
            elif name.endswith('.xlsx'):
                df = pd.read_excel(os.path.join(root, name), sheet_name=0, header=0, engine='openpyxl')
            else:  # skip unsupported formats instead of reusing a stale df
                i += 1
                continue
            # try:
            if len(df) == 0:
                continue
            # if '新版' not in name:
            #     continue
            # cv = converter.DataFrameConverter()
            # df = cv.convert_df_cols(df=df)  # clean column names and illegal characters in the df

            # try:
            #     df = df.astype(dtypes)  # update the df's dtypes from the local file; may raise if fields differ
            # except Exception as e:
            #     print(name, e)
            #     # on failure, intersect the df's data with the json's
            #     old_dt = df.dtypes.apply(str).to_dict()  # dataframe dtypes as a dict
            #     intersection_keys = dtypes.keys() & old_dt.keys()  # intersection of the two dicts' keys
            #     dtypes = {k: dtypes[k] for k in intersection_keys}  # new dict from the intersecting keys
            #     df = df.astype(dtypes)  # update the df's dtypes again
            df.fillna(0, inplace=True)
            for col in df.columns.tolist():
                df[col] = df[col].apply(lambda x: 0 if str(x) == '' else x)

            if '更新时间' not in df.columns.tolist():
                df['更新时间'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # set_typ = {
            #     '日期': 'date',
            #     '店铺名称': 'varchar(100)',
            #     'spu_id': 'varchar(100)',
            #     '图片': 'varchar(255)',
            #     '序号': 'smallint',
            #     '商品名称': 'varchar(255)',
            #     '商品款号': 'varchar(255)',
            #     '一级类目名称': 'varchar(255)',
            #     '二级类目名称': 'varchar(255)',
            #     '三级类目名称': 'varchar(255)',
            #     '数据更新时间': 'timestamp',
            #     '更新时间': 'timestamp',
            # }
            # new_dict = {
            #     '日期': '',
            #     '店铺名称': '',
            #     '序号': '',
            #     '商品名称': '',
            #     'spu_id': '',
            #     '商品款号': '',
            #     '一级类目名称': '',
            #     '二级类目名称': '',
            #     '三级类目名称': '',
            #     '访客量': '',
            #     '浏览量': '',
            #     '下单gmv': '',
            #     '成交gmv': '',
            #     '支付人数_成交': '',
            # }
            # for dict_data in df.to_dict(orient='records'):
            #     new_dict.update(dict_data)
            #     m.dict_to_mysql(
            #         db_name=db_name,
            #         table_name=collection_name,
            #         dict_data=new_dict,
            #         # icm_update=['日期', '店铺名称', 'spu_id', '商品款号'],
            #         unique_main_key=None,
            #         set_typ=set_typ,
            #     )
            m.df_to_mysql(df=df, db_name=db_name, table_name=collection_name,
                          move_insert=False,  # delete first, then insert
                          df_sql=True,
                          drop_duplicates=False,
                          count=f'{i}/{count}',
                          filename=name,
                          set_typ={},
                          )
            # nas.df_to_mysql(df=df, db_name=db_name, table_name=collection_name, drop_duplicates=True,)

            i += 1


def one_file_to_mysql(file, db_name, table_name):
    """ Upload a single file to mysql; the file argument is one file """
    if not os.path.isfile(file):
        print(f'{os.path.splitext(os.path.basename(__file__))[0]}.one_file_to_mysql: 函数只接受文件, 此文件不存在: {file}')
        return
    filename = os.path.basename(file)
    if file.endswith('.xlsx'):
        df = pd.read_excel(file)
    else:
        encod = get_encoding(file)
        df = pd.read_csv(file, encoding=encod, header=0, na_filter=False, float_precision='high')
    # df.replace(to_replace=[','], value='', regex=True, inplace=True)  # strip special characters
    m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)
    # df.pop('id')
    m.df_to_mysql(
        df=df,
        db_name=db_name,
        table_name=table_name,
        # icm_update=['sku_id'],  # incremental update; used for aggregated data only, do not use elsewhere
        move_insert=True,  # delete first, then insert
        df_sql=False,  # when True, upload the whole table via df.to_sql without deduplication
        drop_duplicates=False,  # when True, check for duplicates before inserting; slower
        count=None,
        filename=None,  # used to track processing progress
        reset_id=True,  # whether to reset the auto-increment column
        # set_typ=set_typ,
    )


def cut_as_year_month(as_month=False):
    """
    Split table data by year.
    as_month: when True, re-save by month instead
    """
    file_name = '达摩盘_人群报表'
    path = r'/Users/xigua/Downloads/数据库导出'

    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            if name.endswith('.csv') and 'baidu' not in name and '~' not in name:
                pattern = re.findall(r'\d{4}.csv|\d{4}-\d{2}.csv', name)
                if pattern:
                    continue
                if file_name not in name:
                    continue
                # df = pd.read_excel(os.path.join(root, name), header=0)
                df_before = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                df_before['日期'] = pd.to_datetime(df_before['日期'], format='%Y-%m-%d', errors='ignore')
                max_date = df_before['日期'].max(skipna=True).year
                min_date = df_before['日期'].min(skipna=True).year
                for year in range(min_date, max_date + 1):
                    df = df_before[(df_before['日期'] >= f'{year}-01-01') & (df_before['日期'] <= f'{year}-12-31')]
                    if as_month:
                        for month in range(1, 13):
                            if month < 10:
                                month = f'0{month}'
                            for n in range(31, 27, -1):
                                try:
                                    end_day = pd.to_datetime(f'{year}-{month}-{n}')
                                    break
                                except:
                                    continue
                            st_day = pd.to_datetime(f'{year}-{month}-01')
                            df_month = df[(df['日期'] >= st_day) & (df['日期'] <= end_day)]
                            if len(df_month) == 0:
                                continue
                            df_month.sort_values('日期', ascending=True, ignore_index=True, inplace=True)
                            df_month = df_month.reset_index(drop=True)
                            df_month = df_month.reset_index(drop=False)
                            df_month.pop('id')
                            df_month.rename(columns={'index': 'id'}, inplace=True)
                            df_month['id'] = df_month['id'].apply(lambda x: x + 1)
                            new_name = f'{os.path.splitext(name)[0]}_{year}_{month}.csv'
                            print(new_name)
                            df_month.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)
                    else:
                        df.sort_values('日期', ascending=True, ignore_index=True, inplace=True)
                        df = df.reset_index(drop=True)
                        df = df.reset_index(drop=False)
                        df.pop('id')
                        df.rename(columns={'index': 'id'}, inplace=True)
                        df['id'] = df['id'].apply(lambda x: x + 1)
                        new_name = f'{os.path.splitext(name)[0]}_{year}.csv'
                        print(new_name)
                        df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)


def doc_to_sql(write_data=False, read_data=False):
    if not write_data and not read_data:
        return
    # filename = '关于做好2024年世界互联网大会乌镇峰会期间寄递渠道安全保障工作的通知.pdf'
    path = '/Users/xigua/数据中心/微信pdf文件/2024-10'

    if not os.path.isdir(path):
        print(f'不存在的文件夹: {path}')
        return
    m_engine = mysql.MysqlUpload(
        username=username,
        password=password,
        host=host,
        port=port,
        charset='utf8mb4'
    )
    if write_data:
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or 'baidu' in name:
                    continue
                if name.endswith('.pdf') or name.endswith('.pptx'):
                    file_size = os.stat(os.path.join(root, name)).st_size
                    if file_size > 1024 * 1024 * 1024:
                        file_size = file_size / 1024 / 1024 / 1024
                        file_size = f'{file_size:.2f} GB'
                    elif file_size > 1024 * 1024:
                        file_size = file_size / 1024 / 1024
                        file_size = f'{file_size:.2f} MB'
                    else:
                        file_size = file_size / 1024
                        file_size = f'{file_size:.2f} KB'
                    mod_time = os.path.getmtime(os.path.join(root, name))
                    local_time = time.localtime(mod_time)
                    mod_time_formatted = time.strftime('%Y-%m-%d %H:%M:%S', local_time)

                    # read the PDF file as binary data
                    with open(os.path.join(root, name), 'rb') as file:
                        pdf_data = file.read()
                    dict_data = {
                        '日期': datetime.datetime.today().strftime('%Y-%m-%d'),
                        '数据来源': '微信',
                        '文件名称': name,
                        '文件大小': file_size,
                        '修改时间': mod_time_formatted,
                        '数据主体': pdf_data,
                        '扩展名': os.path.splitext(name)[-1],
                        '更新时间': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    }
                    set_typ = {
                        '日期': 'date',
                        '数据来源': 'varchar(100)',
                        '文件名称': 'varchar(255)',
                        '文件大小': 'varchar(20)',
                        '修改时间': 'timestamp',
                        '数据主体': 'longblob',
                        '扩展名': 'varchar(50)',
                        '更新时间': 'timestamp',
                    }
                    m_engine.doc_to_sql(
                        db_name='pdf文件',
                        table_name='微信pdf文件',
                        remove_by_key=['文件名称'],
                        dict_data=dict_data,
                        set_typ=set_typ,
                        allow_not_null=False,
                        filename=name,
                        reset_id=True,
                    )
    if read_data:
        filename = ''
        save_path = '/Users/xigua/Downloads'
        m_engine.read_doc_data(
            db_name='pdf文件',
            table_name='微信pdf文件',
            column='文件名称',
            filename=filename,
            save_path=save_path,
        )


if __name__ == '__main__':
    doc_to_sql(
        write_data=True,
        read_data=False,
    )
    # cut_as_year_month(as_month=False)

    # username = 'root'
    # password = ''
    # host = ''
    # port = ''

    # # upload a single file to the database
    # one_file_to_mysql(
    #     file=r'/Users/xigua/Downloads/日期表.csv',
    #     db_name='聚合数据test',
    #     table_name='日期表',
    # )

    # col = 1
    # if col:
    #     # upload a directory to the specified database
    #     db_name = '爱库存2'
    #     table_name = '商品spu榜单'
    #     upload_dir(
    #         path=r'/Users/xigua/Downloads/数据上传中心',
    #         db_name=db_name,
    #         collection_name=table_name,
    #     )