mdbq 0.0.9__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/__init__.py +4 -0
- mdbq/aggregation/aggregation.py +1003 -0
- mdbq/aggregation/query_data.py +354 -0
- mdbq/config/get_myconf.py +4 -0
- mdbq/dataframe/converter.py +9 -5
- mdbq/mongo/mongo.py +40 -5
- mdbq/mysql/mysql.py +60 -14
- mdbq/mysql/s_query.py +3 -0
- {mdbq-0.0.9.dist-info → mdbq-0.1.1.dist-info}/METADATA +1 -1
- {mdbq-0.0.9.dist-info → mdbq-0.1.1.dist-info}/RECORD +12 -9
- {mdbq-0.0.9.dist-info → mdbq-0.1.1.dist-info}/WHEEL +0 -0
- {mdbq-0.0.9.dist-info → mdbq-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1003 @@
# -*- coding:utf-8 -*-
import warnings
import pandas as pd
import numpy as np
import chardet
import zipfile
from pandas.tseries.holiday import next_monday
from pyzipper import PyZipFile
import os
import platform
import json
from mdbq.mongo import mongo
from mdbq.mysql import mysql
from mdbq.config import get_myconf
from mdbq.dataframe import converter
import datetime
import time
import re
import shutil
import getpass

warnings.filterwarnings('ignore')
"""
1. DatabaseUpdate: cleans the raw files downloaded by the crawler and loads them into the databases;
   cleaning mainly strips illegal characters from column names and pre-processes invalid values in the df;
   while loading, the dtypes info in the local json file is checked and updated;
   if the json is missing dtypes info, refresh it first with update_dtypte, or add the entries to the local json by hand;
2. DataTypes: stores the dtypes info of one csv file in a local json file; calls converter to pre-process the df;
   use it to complete the dtypes info of a given database;
3. update_dtypte: updates the dtypes info of one csv file into the local json file;
4. upload: uploads a whole folder to the databases;
   if the local json lacks the dtypes info for that database, run update_dtypte to refresh the json before uploading;
"""

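# Minimal end-to-end sketch of the workflow described above (it mirrors
# main() at the bottom of this file; the path and the service names are
# illustrative, not fixed):
#   d = DatabaseUpdate(path='/Users/xigua/Downloads')
#   d.new_unzip(is_move=True)     # unpack downloaded zip archives first
#   d.cleaning(is_move=False)     # clean every report file into d.datas
#   d.upload_df(service_databases=[{'home_lx': 'mongodb'}, {'home_lx': 'mysql'}])
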
class DataTypes:
    """
    Example: add the column info of one table to the json file:
        file = '/Users/xigua/Downloads/天猫直通车旧报表(未排重版本).csv'
        df = pd.read_csv(file, encoding='utf-8_sig', header=0, na_filter=False)
        d = DataTypes()
        d.read_dtypes(
            df=df,
            db_name='天猫数据2',
            collection_name='旧版报表',
            is_file_dtype=False,  # turn off file-first precedence
        )
        d.dtypes_to_file()
    """
    def __init__(self):
        if platform.system() == 'Windows':
            self.path = f'C:\\同步空间\\BaiduSyncdisk\\原始文件2'
        elif platform.system() == 'Darwin':
            self.path = f'/Users/{getpass.getuser()}/数据中心/原始文件2'
        else:
            self.path = 'Downloads/数据中心/原始文件2'  # not usable
        if not os.path.exists(self.path):
            os.makedirs(self.path, exist_ok=True)  # makedirs, since the parent directory may be missing too
        self.json_file = os.path.join(self.path, 'data_types.json')
        # self.datas = json.loads('{}')  # the dtypes data waiting to be written to the json file
        self.datas = {'json统计': {'数据库量': 0, '集合数量': 0, '字段量': 0}}
        self.json_before()

    def json_before(self):
        """ Load the dtypes info of the local json file into self.datas on init """
        if os.path.isfile(self.json_file):
            with open(self.json_file, 'r', encoding='utf-8_sig') as json_file:
                json_ = json.load(json_file)
                self.datas.update(json_)

    def load_dtypes(self, db_name, collection_name):
        return self.datas[db_name][collection_name]

    def read_dtypes(self, db_name, collection_name, df=pd.DataFrame(), is_file_dtype=True):
        """
        Read the dtypes of df and update the local json file.
        Along the way, illegal column names are cleaned and the types converted
        (object columns are tentatively cast to int or float).
        Returns the dtypes of df, for later use as: df = df.astype(dtypes, errors='ignore')
        is_file_dtype=True: by default the existing json wins, so types edited
        by hand in the json file are preserved.
        """
        if len(df) == 0:
            return
        cv = converter.DataFrameConverter()
        df = cv.convert_df_cols(df=df)  # clean illegal characters from the dataframe column names
        dtypes = df.dtypes.apply(str).to_dict()
        dtypes = {db_name: {collection_name: dtypes}}

        if not self.datas:  # no local json info yet: just store and return
            self.datas.update(dtypes)
            return self.datas[db_name][collection_name]
        else:  # otherwise merge the stored info with the dtypes of df
            if db_name in list(self.datas.keys()):  # e.g. ['京东数据2', '天猫数据2', '生意参谋数据2', '生意经2']
                if collection_name in list(self.datas[db_name].keys()):
                    if is_file_dtype:  # stored data wins
                        # update dtypes with the stored values, so types edited by hand in the json survive
                        dtypes[db_name][collection_name].update(self.datas[db_name][collection_name])
                        # then merge back, so self.datas holds both the old and the new info
                        self.datas[db_name][collection_name].update(dtypes[db_name][collection_name])
                    else:  # new data wins
                        self.datas[db_name][collection_name].update(dtypes[db_name][collection_name])
                else:
                    if is_file_dtype:  # stored data wins
                        dtypes[db_name].update(self.datas[db_name])
                        self.datas[db_name].update(dtypes[db_name])
                    else:
                        self.datas[db_name].update(dtypes[db_name])
            else:
                # dtypes.update(self.datas)  # can stay commented out: the stored self.datas is empty here
                self.datas.update(dtypes)
        dbs = 0
        collections = 0
        cols = 0
        # self.datas.pop('json统计')
        for k, v in self.datas.items():
            if k == 'json统计':
                continue
            dbs += 1
            for d, j in v.items():
                collections += 1
                for t, p in j.items():
                    cols += 1
        tips = {'json统计': {'数据库量': dbs, '集合数量': collections, '字段量': cols}}
        self.datas.update(tips)
        return self.datas[db_name][collection_name]  # the dtypes of df

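    # Precedence illustration for is_file_dtype (the values are made up):
    #   stored json : {'日期': 'object'}
    #   incoming df : {'日期': 'datetime64[ns]', '销量': 'int64'}
    #   is_file_dtype=True  -> {'日期': 'object', '销量': 'int64'}           (json wins)
    #   is_file_dtype=False -> {'日期': 'datetime64[ns]', '销量': 'int64'}   (df wins)
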
    def dtypes_to_file(self):
        """ Save self.datas to the local json file """
        # print(self.datas)
        with open(self.json_file, 'w', encoding='utf-8_sig') as json_file:
            json.dump(self.datas, json_file, ensure_ascii=False, sort_keys=True, indent=4)
        time.sleep(1)

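# Round-trip sketch (illustrative; it assumes the json already holds an entry
# for these names, which both appear in upload() further down):
#   dt = DataTypes()
#   dtypes = dt.load_dtypes(db_name='生意经2', collection_name='店铺指标')
#   df = df.astype(dtypes, errors='ignore')  # re-apply the stored types to a re-loaded csv
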
class DatabaseUpdate:
    def __init__(self, path):
        self.path = path  # the directory holding the data, i.e. the download folder
        self.datas: list = []  # the collected datasets waiting to be written to the databases

    def cleaning(self, is_move=True):
        """
        Clean the data; collects entries holding the database name, the collection name and the df body
        """
        if not os.path.exists(self.path):
            print(f'1.1.0 初始化时传入了不存在的目录: {self.path}')
            return

        json_data = DataTypes()  # the json file holding the dtypes info of the data
        for root, dirs, files in os.walk(self.path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
                    continue
                # only csv, xls and xlsx files are processed
                if not name.endswith('.csv') and not name.endswith('.xls') and not name.endswith('.xlsx'):
                    continue
                df = pd.DataFrame()
                encoding = self.get_encoding(file_path=os.path.join(root, name))  # used for csv files
                tg_names = ['账户报表', '计划报表', '单元报表', '关键词报表', '人群报表', '宝贝主体报表',
                            '其他主体报表',
                            '创意报表', '地域报表', '权益报表']
                for tg_name in tg_names:
                    if tg_name in name and '报表汇总' not in name and name.endswith('.csv'):  # excludes the 达摩盘 report: 人群报表汇总
                        pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
                        if not pattern:  # no match means the file was already converted
                            continue
                        shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
                        if shop_name:
                            shop_name = shop_name[0]
                        else:
                            shop_name = ''
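                        # Filename anatomy (the name below is illustrative, not from the
                        # source): a raw download such as '账户报表_20240101_123456_某旗舰店.csv'
                        # matches r'(.*_)\d{8}_\d{6}' above, and the second pattern captures
                        # the shop name that follows the timestamp; renamed files no longer
                        # match and are skipped.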
                        df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                        if '地域' not in name:  # except for the region report, require the 场景名字 column; if it is missing, the "pbix" template was not selected
                            ck = df.columns.tolist()
                            if '场景名字' not in ck:
                                print(f'1.2.0 {name} 报表字段缺失, 请选择Pbix数据模板下载')
                                continue
                        if len(df) == 0:
                            print(f'1.3.0 {name} 报表是空的, 请重新下载')
                            continue
                        cols = df.columns.tolist()
                        if '日期' not in cols:
                            print(f'1.4.0 {name} 报表不包含分日数据, 已跳过')
                            continue
                        if '省' in cols:
                            if '市' not in cols:
                                print(f'1.5.0 {name} 请下载市级地域报表,而不是省报表')
                                continue
                        # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # replace special characters
                        # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                        # df.fillna(0, inplace=True)
                        db_name = '天猫数据2'
                        collection_name = f'推广数据_{tg_name}'
                if name.endswith('.csv') and '超级直播' in name:
                    # 超级直播 (super live) report
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
                    if not pattern:  # no match means the file was already converted
                        continue
                    shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
                    if shop_name:
                        shop_name = shop_name[0]
                    else:
                        shop_name = ''
                    # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # replace special characters
                    # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                    db_name = '天猫数据2'
                    collection_name = '推广数据_超级直播'
                elif name.endswith('.xls') and '短直联投' in name:
                    # 短直联投 report
                    df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
                    df = pd.concat(df)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                    db_name = '天猫数据2'
                    collection_name = '推广数据_短直联投'
                elif name.endswith('.xls') and '视频加速推广' in name:
                    # 超级短视频 report
                    df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
                    df = pd.concat(df)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
                    db_name = '天猫数据2'
                    collection_name = '推广数据_超级短视频'
                if '人群报表汇总' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    db_name = '天猫数据2'
                    collection_name = '天猫_达摩盘_DMP报表'
                # ----------------- promotion reports: divider -----------------
                # ----------------- promotion reports: divider -----------------
                date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
                date02 = re.findall(r'\d{4}-\d{2}-\d{2}_(\d{4}-\d{2}-\d{2})', str(name))
                attrib_pattern = re.findall(r'(\d+).xlsx', name)  # Tmall product-material sheets; indispensable
                if name.endswith('.xls') and '生意参谋' in name and '无线店铺流量来源' in name:
                    # wireless shop traffic sources
                    df = pd.read_excel(os.path.join(root, name), header=5)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
                    # df.replace(to_replace=[','], value='', regex=True, inplace=True)
                    if date01[0] != date02[0]:
                        data_lis = date01[0] + '_' + date02[0]
                        df.insert(loc=0, column='数据周期', value=data_lis)
                    df.insert(loc=0, column='日期', value=date01[0])
                    # 2024-2-19 the platform renamed the promotion channel sources
                    df['三级来源'] = df['三级来源'].apply(
                        lambda x: '精准人群推广' if x == '精准人群推广(原引力魔方)'
                        else '关键词推广' if x == '关键词推广(原直通车)'
                        else '智能场景' if x == '智能场景(原万相台)'
                        else x
                    )
                    db_name = '生意参谋数据2'
                    if '经营优势' in df['一级来源'].tolist():  # new-style traffic
                        if '数据周期' in df.columns.tolist():
                            collection_name = '店铺来源_月数据_新版'
                        else:
                            collection_name = '店铺来源_日数据_新版'
                    else:  # old-style traffic
                        if '数据周期' in df.columns.tolist():
                            collection_name = '店铺来源_月数据'
                        else:
                            collection_name = '店铺来源_日数据'
                elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
                    # shop product ranking
                    df = pd.read_excel(os.path.join(root, name), header=4)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
                    # df.replace(to_replace=[','], value='', regex=True, inplace=True)
                    df.rename(columns={'统计日期': '日期', '商品ID': '商品id'}, inplace=True)
                    if date01[0] != date02[0]:
                        data_lis = date01[0] + '_' + date02[0]
                        df.insert(loc=1, column='数据周期', value=data_lis)
                    db_name = '生意参谋数据2'
                    collection_name = '商品排行'
                elif name.endswith('.xls') and '参谋店铺整体日报' in name:
                    # self-service export: shop daily report
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    db_name = '生意参谋数据2'
                    collection_name = '自助取数_整体日报'
                elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
                    # self-service export: daily traffic
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    # 2024-2-19 the platform renamed the promotion channel sources; the self-service export did not follow, so force the new names here
                    df['三级来源'] = df['三级来源'].apply(
                        lambda x: '精准人群推广' if x == '引力魔方'
                        else '关键词推广' if x == '直通车'
                        else '智能场景' if x == '万相台'
                        else '精准人群推广' if x == '精准人群推广(原引力魔方)'
                        else '关键词推广' if x == '关键词推广(原直通车)'
                        else '智能场景' if x == '智能场景(原万相台)'
                        else x
                    )
                    db_name = '生意参谋数据2'
                    collection_name = '自助取数_每日流量'
                elif name.endswith('.xls') and '商品sku' in name:
                    # self-service export: product sku
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={
                        '统计日期': '日期',
                        '商品ID': '商品id',
                        'SKU ID': 'sku id',
                        '商品SKU': '商品sku',
                    }, inplace=True)
                    db_name = '生意参谋数据2'
                    collection_name = '自助取数_商品sku'
                elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
                    # self-service export: monthly shop traffic sources
                    df = pd.read_excel(os.path.join(root, name), header=7)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'统计日期': '数据周期'}, inplace=True)
                    # 2024-2-19 the platform renamed the promotion channel sources; the self-service export did not follow, so force the new names here
                    df['三级来源'] = df['三级来源'].apply(
                        lambda x: '精准人群推广' if x == '引力魔方'
                        else '关键词推广' if x == '直通车'
                        else '智能场景' if x == '万相台'
                        else '精准人群推广' if x == '精准人群推广(原引力魔方)'
                        else '关键词推广' if x == '关键词推广(原直通车)'
                        else '智能场景' if x == '智能场景(原万相台)'
                        else x
                    )
                    df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
                    db_name = '生意参谋数据2'
                    collection_name = '自助取数_店铺流量_月数据'
                elif name.endswith('.csv') and 'baobei' in name:
                    # 生意经 daily product metrics
                    date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
                    if not date:  # blocks monthly data and already-converted sheets
                        print(f'{name} 不支持或是已转换的表格')
                        # os.remove(os.path.join(root, name))  # deleting directly avoids filing it with the raw files; a different encoding would raise errors
                        continue
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        os.remove(os.path.join(root, name))
                        continue
                    if '日期' in df.columns.tolist():
                        df.pop('日期')
                    new_date = '-'.join(date[0])
                    df.insert(loc=0, column='日期', value=new_date)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    db_name = '生意经2'
                    collection_name = '宝贝指标'
                elif name.endswith('.csv') and '店铺销售指标' in name:
                    # 生意经 shop metrics; meant for monthly data, though daily metrics also work
                    name_st = re.findall(r'(.*)\(分日', name)
                    if not name_st:
                        print(f'{name} 已转换的表格')
                        continue
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df['日期'] = df['日期'].astype(str).apply(
                        lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    db_name = '生意经2'
                    collection_name = '店铺指标'
                elif name.endswith('csv') and '省份' in name:
                    # 生意经 regional distribution; daily data only
                    pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
                    if not pattern or '省份城市分析2' not in name:
                        print(f'{name} 不支持或已转换的表格')
                        # os.remove(os.path.join(root, name))  # deleting directly avoids filing it with the raw files; a different encoding would raise errors
                        continue
                    date = '-'.join(pattern[0][1:])
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
                    df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
                    df['省'].fillna(method='ffill', inplace=True)
                    df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
                    pov = df.pop('省')
                    city = df.pop('城市')
                    df.insert(loc=1, column='城市', value=city)
                    df.insert(loc=0, column='日期', value=date)
                    df['省份'] = pov
                    df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
                    db_name = '生意经2'
                    collection_name = '地域分布_省份城市分析'
                elif name.endswith('csv') and 'order' in name:
                    # 生意经 order data; monthly data only
                    pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
                    if not pattern:
                        print(f'{name} 不支持或已转换的表格')
                        # os.remove(os.path.join(root, name))  # deleting directly avoids filing it with the raw files; a different encoding would raise errors
                        continue
                    date1 = pattern[0][1:4]
                    date1 = '-'.join(date1)
                    date2 = pattern[0][4:]
                    date2 = '-'.join(date2)
                    date = f'{date1}_{date2}'
                    df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.insert(loc=0, column='日期', value=date1)
                    df.insert(loc=1, column='数据周期', value=date)
                    df['商品id'] = df['宝贝链接'].apply(
                        lambda x: re.sub('.*id=', '', x) if x else x)
                    df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
                    df['颜色编码'] = df['商家编码'].apply(
                        lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
                    db_name = '生意经2'
                    collection_name = '订单数据'
                elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
                    # live-room order details
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
                    df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
                    db_name = '生意参谋数据2'
                    collection_name = '直播间成交订单明细'
                elif name.endswith('.xlsx') and '直播间大盘数据' in name:
                    # live-room market data
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    db_name = '生意参谋数据2'
                    collection_name = '直播间大盘数据'
                elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
                    # live performance: transaction breakdown
                    df = pd.read_excel(os.path.join(root, name), header=5)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'统计日期': '日期'}, inplace=True)
                    db_name = '生意参谋数据2'
                    collection_name = '直播业绩'
                elif name.endswith('.csv') and '淘宝店铺数据' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '市场数据2'
                    collection_name = '淘宝店铺数据'
                elif name.endswith('.csv') and '人群洞察' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    df = df[df['人群规模'] != '']
                    if len(df) == 0:
                        continue
                    db_name = '天猫数据2'
                    collection_name = '万相台_人群洞察'
                elif name.endswith('.csv') and '客户_客户概况_画像' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '生意参谋数据2'
                    collection_name = '客户_客户概况_画像'
                elif name.endswith('.csv') and '市场排行_店铺' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '市场数据2'
                    collection_name = '市场排行_店铺'
                elif name.endswith('.csv') and '类目洞察_属性分析' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '市场数据2'
                    collection_name = '类目洞察_属性分析'
                elif name.endswith('.csv') and '类目洞察_价格分析' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '市场数据2'
                    collection_name = '类目洞察_价格分析'
                elif name.endswith('.csv') and '竞店分析-销售分析' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '市场数据2'
                    collection_name = '竞店分析_销售分析'
                elif name.endswith('.csv') and '竞店分析-来源分析' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    db_name = '市场数据2'
                    collection_name = '竞店分析_来源分析'
                # ----------------------- JD data: divider -----------------------
                # ----------------------- JD data: divider -----------------------
                elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
                    # JD shop traffic sources
                    if '按天' not in name:
                        print(f'{name} 京东流量请按天下载')
                        continue
                    date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
                    new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
                    new_date02 = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
                    new_date03 = f'{new_date01}_{new_date02}'
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.insert(loc=0, column='日期', value=new_date01)
                    if new_date01 != new_date02:
                        df.insert(loc=1, column='数据周期', value=new_date03)
                    cols = df.columns.tolist()
                    for col_2024 in cols:  # this JD sheet suffixes some columns with last year's date; drop those year-over-year columns, or the column count explodes
                        if '20' in col_2024 and '流量来源' in name:
                            df.drop(col_2024, axis=1, inplace=True)
                    db_name = '京东数据2'
                    collection_name = '流量来源_日数据'
                elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
                    # JD product details: file conversion
                    date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
                    if not date1:  # checking the list itself; indexing an empty match would raise IndexError
                        print(f'{name}: 仅支持日数据')
                        continue
                    if date1:
                        date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
                        new_name = f'sku_{date1}_全部渠道_商品明细.csv'
                    elif '10021440233518' in df['商品ID'].values or '10022867813485' in df['商品ID'].values:
                        new_name = f'spu_{date1}_全部渠道_商品明细.csv'
                    else:
                        new_name = f'未分类_{date1}_全部渠道_商品明细.csv'
                    df.rename(columns={'商品ID': '商品id'}, inplace=True)
                    df.insert(loc=0, column='日期', value=date1)
                    if 'sku' in new_name:
                        db_name = '京东数据2'
                        collection_name = 'sku_商品明细'
                    elif 'spu' in new_name:
                        db_name = '京东数据2'
                        collection_name = 'spu_商品明细'
                elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
                    # JD ranking under product keywords
                    df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
                    for col in ['词人气', '搜索点击率']:
                        if col in df.columns.tolist():
                            df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
                    db_name = '京东数据2'
                    collection_name = '商品词下排名'
                elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
                    # JD product ranking
                    date_in = re.findall(r'(\d{4}-\d{2}-\d{2})-搜索', str(name))[0]
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.insert(0, '日期', date_in)  # insert a new column
                    df.rename(columns={'SKU': 'skuid'}, inplace=True)
                    if '点击率' in df.columns.tolist():
                        df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
                    db_name = '京东数据2'
                    collection_name = '商品排名'
                elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
                    # JD competition: competitor overview, details, all channels
                    date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
                    start_date = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
                    # end_date = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df.insert(loc=0, column='日期', value=start_date)
                    db_name = '京东数据2'
                    collection_name = '竞店监控_日数据'
                elif name.endswith('.xls') and '店铺' in name:
                    # JD self-service report: shop daily report
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df['日期'] = df['日期'].apply(
                        lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
                    )
                    db_name = '京东数据2'
                    collection_name = '京东_自助取数_店铺日报'
                elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
                    # JD industry merchant ranking
                    date2 = re.findall(r'_\d{8}-\d+', name)
                    if date2:
                        print(f'{name}: 请下载日数据,不支持其他周期')
                        # os.remove(os.path.join(root, name))  # deleting directly avoids filing it with the raw files; a different encoding would raise errors
                        continue
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
                    df.insert(loc=0, column='类型', value='商家榜单')
                    db_name = '京东数据2'
                    collection_name = '商家榜单'
                elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
                    # JD sku export
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    d_time = datetime.datetime.today().strftime('%Y-%m-%d')
                    df.insert(loc=0, column='日期', value=d_time)
                    df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
                    db_name = '属性设置2'
                    collection_name = '京东sku商品信息'
                elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
                    # JD spu export
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    d_time = datetime.datetime.today().strftime('%Y-%m-%d')
                    df.insert(loc=0, column='日期', value=d_time)
                    db_name = '属性设置2'
                    collection_name = '京东spu商品信息'
                elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
                    # JD promotion data
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                    db_name = '京东数据2'
                    collection_name = '推广数据_京准通'
                elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                    df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
                    df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
                    db_name = '京东数据2'
                    collection_name = '推广数据_搜索词报表'
                elif name.endswith('.xlsx') and '零售明细统计' in name:
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        continue
                    df = df[df['缩略图'] != '合计']
                    db_name = '生意经2'
                    collection_name = 'E3_零售明细统计'

                # product materials; must stay last in the chain
                elif name.endswith('xlsx'):
                    """ files downloaded from the Tmall product material library: add the file modification date to the df and the file name """
                    if attrib_pattern:
                        df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
                        cols = df.columns.tolist()
                        if '商品白底图' in cols and '方版场景图' in cols:
                            f_info = os.stat(os.path.join(root, name))  # read the stat info of the file
                            mtime = time.strftime('%Y-%m-%d', time.localtime(f_info.st_mtime))  # the file modification date
                            df['日期'] = mtime
                            df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                            df.rename(columns={'商品ID': '商品id'}, inplace=True)
                            sp_id = df['商品id'].tolist()
                            if 652737455554 in sp_id or 683449516249 in sp_id or 37114359548 in sp_id or 570735930393 in sp_id:
                                df.insert(0, '店铺名称', '万里马官方旗舰店')  # insert a new column
                            elif 704624764420 in sp_id or 701781021639 in sp_id or 520380314717 in sp_id:
                                df.insert(0, '店铺名称', '万里马官方企业店')  # insert a new column
                            else:
                                df.insert(0, '店铺名称', 'coome旗舰店')  # insert a new column
                            db_name = '属性设置2'
                            collection_name = '商品素材导出'

                if is_move:
                    try:
                        os.remove(os.path.join(root, name))  # whether to remove the original file
                    except Exception as e:
                        print(f'{name}, {e}')
                if len(df) > 0:
                    # create the json file holding the dtypes info
                    json_data.read_dtypes(
                        df=df,
                        db_name=db_name,
                        collection_name=collection_name,
                        is_file_dtype=True,  # default: the local file wins
                    )
                    # hand the data to self.datas, where it waits to be written to the databases
                    self.datas.append(
                        {
                            '数据库名': db_name,
                            '集合名称': collection_name,
                            '数据主体': df,
                        }
                    )
        json_data.dtypes_to_file()  # write the json file holding the dtypes info of the data

        # 品销宝 workbooks bundle several sheets; best handled separately
        json_data = DataTypes()  # the json file holding the dtypes info of the data
        for root, dirs, files in os.walk(self.path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                # df = pd.DataFrame()
                if name.endswith('.xlsx') and '明星店铺' in name:
                    # 品销宝
                    pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})_', name)
                    if pattern:
                        continue
                    sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群']  # 品销宝 sheets
                    file_name4 = os.path.splitext(name)[0]  # 明星店铺 report
                    for sheet4 in sheets4:
                        df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
                        df = df[df['搜索量'] > 0]
                        if len(df) < 1:
                            # print(f'{name}/{sheet4} 跳过')
                            continue
                        df.insert(loc=1, column='报表类型', value=sheet4)
                        db_name = '天猫数据2'
                        collection_name = f'推广数据_品销宝_{sheet4}'
                        json_data.read_dtypes(
                            df=df,
                            db_name=db_name,
                            collection_name=collection_name,
                            is_file_dtype=False,
                        )
                        self.datas.append(
                            {
                                '数据库名': db_name,
                                '集合名称': collection_name,
                                '数据主体': df,
                            }
                        )
                    if is_move:
                        os.remove(os.path.join(root, name))
        json_data.dtypes_to_file()  # write the json file holding the dtypes info of the data

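    # Possible consolidation (a sketch, not used by the code above): the same
    # channel-rename mapping appears three times in cleaning(); a hypothetical
    # shared helper would keep the variants in sync:
    #   @staticmethod
    #   def _rename_third_source(x):
    #       mapping = {
    #           '引力魔方': '精准人群推广', '精准人群推广(原引力魔方)': '精准人群推广',
    #           '直通车': '关键词推广', '关键词推广(原直通车)': '关键词推广',
    #           '万相台': '智能场景', '智能场景(原万相台)': '智能场景',
    #       }
    #       return mapping.get(x, x)
    #   # usage: df['三级来源'] = df['三级来源'].apply(self._rename_third_source)
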
    def upload_df(self, service_databases=[{}]):
        """
        Upload the cleaned dataframes to the databases
        """
        for service_database in service_databases:
            for service_name, database in service_database.items():
                # print(service_name, database)
                if database == 'mongodb':
                    username, password, host, port = get_myconf.select_config_values(
                        target_service=service_name,
                        database=database,
                    )
                    d = mongo.UploadMongo(
                        username=username,
                        password=password,
                        host=host,
                        port=port,
                        drop_duplicates=False,
                    )
                    for data in self.datas:
                        df, db_name, collection_name = data['数据主体'], data['数据库名'], data['集合名称']
                        d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)

                elif database == 'mysql':
                    username, password, host, port = get_myconf.select_config_values(
                        target_service=service_name,
                        database=database,
                    )
                    m = mysql.MysqlUpload(
                        username=username,
                        password=password,
                        host=host,
                        port=port,
                    )
                    for data in self.datas:
                        df, db_name, collection_name = data['数据主体'], data['数据库名'], data['集合名称']
                        m.df_to_mysql(df=df, db_name=db_name, tabel_name=collection_name)

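    # Illustrative call (matches main() below); each dict maps a configured
    # service name to the database type that should receive the data:
    #   d.upload_df(service_databases=[{'home_lx': 'mongodb'}, {'home_lx': 'mysql'}])
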
    def new_unzip(self, path=None, is_move=None):
        """
        Unzip archives and remove the zip files.
        For the JD product detail files the steps are:
        1. read the file name inside the zip archive
        2. build the full path and check whether a file of the same name already exists in the folder
        3. if it does, rename the existing file (extract the date from its name and rebuild the file name)
        4. then unpack the zip archive
        5. the freshly extracted file still needs renaming, via _jd_rename
        is_move: whether to remove all zip files from the download directory
        """
        if not path:
            path = self.path
        res_names = []  # zip files to remove afterwards
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or 'DS_Store' in name or 'baidu' in name or 'xunlei' in name:
                    continue
                if name.endswith('.zip'):
                    old_file = os.path.join(root, name)
                    f = zipfile.ZipFile(old_file, 'r')
                    if len(f.namelist()) == 1:  # the archive contains a single file
                        for zip_name in f.namelist():  # read the file names inside the zip
                            try:
                                # names stored without the zip UTF-8 flag come back
                                # cp437-mangled; this round trip recovers them
                                zip_name_1 = zip_name.encode('cp437').decode('utf-8')
                            except Exception:
                                zip_name_1 = zip_name.encode('utf-8').decode('utf-8')
                            new_path = os.path.join(root, zip_name_1)  # the full path after extraction
                            if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path:  # a file with the same name already exists
                                # special handling for JD files
                                df = pd.read_excel(new_path)
                                try:
                                    pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细', name)
                                    pattern2 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细', name)
                                    if pattern1:
                                        year_date = '-'.join(list(pattern1[0])) + '_' + '-'.join(list(pattern1[0]))
                                    elif pattern2:
                                        year_date = '-'.join(list(pattern2[0])[0:3]) + '_' + '-'.join(list(pattern2[0])[3:7])
                                    else:
                                        year_date = '无法提取日期'
                                        print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                                    if ('10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values):
                                        os.rename(new_path, os.path.join(root, 'sku_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    elif ('10021440233518' in df['商品ID'].values or '10022867813485' in df['商品ID'].values):
                                        os.rename(new_path, os.path.join(root, 'spu_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    if is_move:
                                        os.remove(os.path.join(root, name))
                                except Exception as e:
                                    print(e)
                                    continue
                            else:
                                f.extract(zip_name, root)
                                if zip_name_1 != zip_name:
                                    os.rename(os.path.join(root, zip_name), os.path.join(root, zip_name_1))
                                if is_move:
                                    res_names.append(name)
                                    # os.remove(os.path.join(root, name))  # cannot remove here: the file is still in use
                        f.close()
                    else:  # the archive contains multiple files
                        f.close()
                        self.unzip_all(path=old_file, save_path=path)

        if is_move:
            for name in res_names:
                os.remove(os.path.join(path, name))
                print(f'移除{os.path.join(path, name)}')

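    # Why cp437: the zip format stores file names that lack the UTF-8 flag
    # using the historical cp437 codec, so a Chinese name read back through
    # zipfile looks garbled until it is re-encoded as cp437 and decoded as
    # utf-8 again; both new_unzip() above and unzip_all() below rely on that
    # round trip.
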
    def unzip_all(self, path, save_path):
        """
        1. Walk the extracted directory and rename files whose names are garbled.
        2. If the archive unpacks to a folder, save the contents under a new folder and delete the garbled one.
        3. Delete the macOS helper folder __MACOSX.
        """
        with PyZipFile(path) as _f:
            _f.extractall(save_path)
            _f.close()
        for _root, _dirs, _files in os.walk(save_path, topdown=False):
            for _name in _files:
                if '~$' in _name or 'DS_Store' in _name:
                    continue
                try:
                    _new_root = _root.encode('cp437').decode('utf-8')
                    _new_name = _name.encode('cp437').decode('utf-8')
                except Exception:
                    _new_root = _root.encode('utf-8').decode('utf-8')
                    _new_name = _name.encode('utf-8').decode('utf-8')
                _old = os.path.join(_root, _name)
                _new = os.path.join(_new_root, _new_name)
                if _new_root != _root:  # the directory name is garbled: create a new directory
                    os.makedirs(_new_root, exist_ok=True)
                os.rename(_old, _new)
            try:
                _new_root = _root.encode('cp437').decode('utf-8')
            except Exception:
                _new_root = _root.encode('utf-8').decode('utf-8')
            if _new_root != _root or '__MACOSX' in _root:
                shutil.rmtree(_root)

    def get_encoding(self, file_path):
        """
        Detect the encoding of a file; reading is fairly slow, so avoid it unless necessary
        """
        with open(file_path, 'rb') as f:
            f1 = f.read()
            encod = chardet.detect(f1).get('encoding')
        return encod

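    # A faster variant (a sketch, not part of the original code): chardet can
    # usually identify a csv's encoding from the first chunk alone, which
    # avoids reading huge files end to end:
    #   with open(file_path, 'rb') as f:
    #       encod = chardet.detect(f.read(65536)).get('encoding')
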

def update_dtypte():
    """ Update the dtypes info of one file into the json file """
    file = '/Users/xigua/数据中心/原始文件2/月数据/流量来源/【生意参谋平台】无线店铺流量来源-2023-04-01_2023-04-30.csv'
    df = pd.read_csv(file, encoding='utf-8_sig', header=0, na_filter=False)
    d = DataTypes()
    d.read_dtypes(
        df=df,
        db_name='生意参谋数据2',
        collection_name='店铺来源_月数据',
        is_file_dtype=False,  # turn off file-first precedence
    )
    d.dtypes_to_file()


def upload():
    """ Upload a whole folder to the databases """
    path = '/Users/xigua/数据中心/原始文件2/生意经/店铺指标'
    db_name = '生意经2'
    collection_name = '店铺指标'

    username, password, host, port = get_myconf.select_config_values(
        target_service='home_lx',
        database='mongodb',
    )
    d = mongo.UploadMongo(
        username=username,
        password=password,
        host=host,
        port=port,
        drop_duplicates=False,
    )
    username, password, host, port = get_myconf.select_config_values(
        target_service='home_lx',
        database='mysql',
    )
    m = mysql.MysqlUpload(
        username=username,
        password=password,
        host=host,
        port=port,
    )
    username, password, host, port = get_myconf.select_config_values(
        target_service='nas',
        database='mysql',
    )
    nas = mysql.MysqlUpload(
        username=username,
        password=password,
        host=host,
        port=port,
    )

    dt = DataTypes()
    dtypes = dt.load_dtypes(
        db_name=db_name,
        collection_name=collection_name,
    )
    # print(dtypes)
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            if '~$' in name or '.DS' in name or '.localized' in name or 'baidu' in name:
                continue
            if name.endswith('.csv'):
                # print(name)
                try:
                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                    if len(df) == 0:
                        continue
                    cv = converter.DataFrameConverter()
                    df = cv.convert_df_cols(df=df)  # clean illegal characters from the column names and the df
                    try:
                        df = df.astype(dtypes)
                    except Exception as e:
                        print(name, e)
                    # the stored dtypes may list columns this csv lacks, which makes
                    # the astype above raise; keep only the keys both sides share and retry
                    old_dt = df.dtypes.apply(str).to_dict()  # the dataframe dtypes as a dict
                    intersection_keys = dtypes.keys() & old_dt.keys()  # the intersection of the two key sets
                    dtypes = {k: dtypes[k] for k in intersection_keys}  # a new dict built from the shared keys
                    df = df.astype(dtypes)
                    # print(intersection_dict)
                    # print(df)

                    d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)
                    m.df_to_mysql(df=df, db_name=db_name, tabel_name=collection_name)
                    nas.df_to_mysql(df=df, db_name=db_name, tabel_name=collection_name)
                except Exception as e:
                    print(name, e)
    if d.client:
        d.client.close()  # the database connection must be closed manually


def main():
    d = DatabaseUpdate(path='/Users/xigua/Downloads')
    d.new_unzip(is_move=True)
    d.cleaning(is_move=False)
    d.upload_df(service_databases=[{'home_lx': 'mongodb'}, {'home_lx': 'mysql'}])
    # print(d.datas)


if __name__ == '__main__':
    # username, password, host, port = get_myconf.select_config_values(target_service='nas', database='mysql')
    # print(username, password, host, port)

    # main()
    upload()