mdbq 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1321 @@
1
+ # -*- coding:utf-8 -*-
2
+ import warnings
3
+ import pandas as pd
4
+ from functools import wraps
5
+ import chardet
6
+ import zipfile
7
+ from pyzipper import PyZipFile
8
+ import os
9
+ import platform
10
+ import pathlib
11
+ import json
12
+ from mdbq.mongo import mongo
13
+ from mdbq.mysql import mysql
14
+ from mdbq.config import get_myconf
15
+ import datetime
16
+ import time
17
+ import re
18
+ import shutil
19
+ import getpass
20
+
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
+ class DataClean:
25
+ """ 数据分类 """
26
+
27
def __init__(self, path, source_path):
    """Remember the working directories and enable both database uploads.

    Args:
        path: Directory that ``__call__`` hands to every processing step
            as its ``path`` argument.
        source_path: Base directory under which ``change_and_sort`` writes
            some of the converted CSV reports.
    """
    self.path, self.source_path = path, source_path
    # Upload switches read by change_and_sort; set one to False to skip
    # that database.
    self.set_up_to_mogo: bool = True  # when falsy, nothing is uploaded to MongoDB
    self.set_up_to_mysql: bool = True  # when falsy, nothing is uploaded to MySQL
32
+
33
+ def __call__(self, *args, **kwargs): # run every processing step on self.path, in order; *args/**kwargs are accepted but never used
34
+ self.new_unzip(path=self.path, is_move=True) # unzip the archive files
35
+ self.change_and_sort(path=self.path) # data conversion of the report files
36
+
37
+ self.move_all(path=self.path) # move the files to the original-files folder
38
+ self.attribute(path=self.path) # rename and classify product material files
39
+
40
@staticmethod
def try_except(func):
    """Decorator that reports exceptions instead of propagating them.

    The wrapped callable is invoked with the caller's arguments. If it
    raises any ``Exception``, the function name and the error are printed
    and ``None`` is returned; otherwise the callable's own result is
    returned.

    Args:
        func: Callable to guard.

    Returns:
        A wrapper carrying ``func``'s metadata (via ``functools.wraps``).
    """
    @wraps(func)
    def guarded(*call_args, **call_kwargs):
        try:
            outcome = func(*call_args, **call_kwargs)
        except Exception as err:
            # Best-effort: report the failure on stdout and carry on.
            print(f'{func.__name__}, {err}')
            return None
        return outcome

    return guarded
50
+
51
@staticmethod
def get_encoding(file_path):
    """Detect the character encoding of a file with ``chardet``.

    The whole file is read into memory before detection, so this is slow;
    avoid calling it unless the encoding is really needed.

    Args:
        file_path: Path of the file to inspect (opened in binary mode).

    Returns:
        The ``'encoding'`` entry of the ``chardet.detect`` result, or
        ``None`` when that entry is absent.
    """
    with open(file_path, mode='rb') as stream:
        raw_bytes = stream.read()
    detection = chardet.detect(raw_bytes)
    return detection.get('encoding')
60
+
61
@staticmethod
def save_to_csv(_df, _save_paths, filenames, encoding='utf-8_sig'):
    """Write a DataFrame to ``<_save_paths>/<filenames>`` as a CSV file.

    Args:
        _df: DataFrame to save; written with a header row and no index.
        _save_paths: Target directory; created (with parents) if missing.
        filenames: Output file name. ``.csv`` is appended unless the name
            already ends with it.
        encoding: Encoding passed to ``DataFrame.to_csv``.
    """
    # Test the suffix, not mere containment: a name such as 'old.csv.bak'
    # contains '.csv' but would otherwise be saved without a CSV extension.
    if not filenames.endswith('.csv'):
        filenames = f'{filenames}.csv'
    # exist_ok=True already tolerates an existing directory, so no separate
    # existence check is needed beforehand.
    os.makedirs(_save_paths, exist_ok=True)
    _df.to_csv(os.path.join(_save_paths, filenames), encoding=encoding, index=False, header=True)
68
+
69
+ # @try_except
70
+ def change_and_sort(self, path=None):
71
+ """数据转换"""
72
+ if not path:
73
+ path = self.path
74
+
75
+ if self.set_up_to_mogo:
76
+ username, password, host, port = get_myconf.select_config_values(target_service='home_lx',
77
+ database='mongodb')
78
+ d = mongo.UploadMongo(username=username, password=password, host=host, port=port,
79
+ drop_duplicates=False
80
+ )
81
+ if self.set_up_to_mysql:
82
+ username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mysql')
83
+ m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)
84
+
85
+ for root, dirs, files in os.walk(path, topdown=False):
86
+ for name in files:
87
+ if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
88
+ continue
89
+ encoding = self.get_encoding(file_path=pathlib.Path(root, name))
90
+ # ----------------- 推广报表 分割线 -----------------
91
+ tg_names = ['账户报表', '计划报表', '单元报表', '关键词报表', '人群报表', '宝贝主体报表',
92
+ '其他主体报表',
93
+ '创意报表', '地域报表', '权益报表']
94
+ for tg_name in tg_names:
95
+ if tg_name in name and '汇总' not in name and name.endswith('.csv'): # 人群报表排除达摩盘报表: 人群报表汇总
96
+ pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
97
+ if not pattern: # 说明已经转换过
98
+ continue
99
+ shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
100
+ if shop_name:
101
+ shop_name = shop_name[0]
102
+ else:
103
+ shop_name = ''
104
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
105
+ if '地域' not in name: # 除了地域报表, 检查数据的字段是否包含“场景名字”,如果没有,说明没有选“pbix” 数据模块下载
106
+ ck = df.columns.tolist()
107
+ if '场景名字' not in ck:
108
+ print(f'{name} 报表字段缺失, 请选择Pbix数据模板下载')
109
+ continue
110
+ if len(df) == 0:
111
+ print(f'{name} 报表是空的, 请重新下载, 此报表已移除')
112
+ os.remove(os.path.join(root, name))
113
+ continue
114
+
115
+ df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
116
+ df.fillna(0, inplace=True)
117
+ col_ids = ['场景ID', '计划ID', '单元ID', '主体ID', '宝贝ID', '词ID/词包ID', '创意ID']
118
+ sb = df.columns.tolist()
119
+ if '日期' not in sb:
120
+ print(f'{name} 注意:该报表不包含分日数据,数据不会保存,请重新下载!')
121
+ continue
122
+ if '省' in sb:
123
+ if '市' not in sb:
124
+ print(
125
+ f'{name} 注意:请下载市级地域报表,而不是省报表,数据不会保存,请重新下载!')
126
+ continue
127
+ for col_id in col_ids:
128
+ if col_id in sb:
129
+ df[col_id] = df[col_id].apply(
130
+ lambda x: f'="{x}"' if x and '=' not in str(x) else x
131
+ )
132
+ date_min = f'_{df["日期"].values.min()}_'
133
+ date_max = f'{df["日期"].values.max()}.csv'
134
+ if '万里马' in name:
135
+ tm_s_name = pattern[0] + shop_name + date_min + date_max
136
+ new_root_p = pathlib.Path(self.source_path, '推广报表', tg_name) # 文件夹,未包括文件名
137
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
138
+ self.save_to_csv(df, new_root_p, tm_s_name)
139
+ if self.set_up_to_mogo:
140
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name=f'天猫_推广_{tg_name}')
141
+ if self.set_up_to_mysql:
142
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name=f'天猫_推广_{tg_name}')
143
+ os.remove(os.path.join(root, name))
144
+ else:
145
+ print(f'{name} 文件名不含"万里马", 不属于爬虫下载,您可以手动进行分类,但不会上传数据库')
146
+
147
+ if name.endswith('.csv') and '超级直播' in name:
148
+ # 超级直播
149
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
150
+ if len(df) == 0:
151
+ print(f'{name} 报表数据为空')
152
+ continue
153
+ pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
154
+ shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
155
+ if shop_name:
156
+ shop_name = shop_name[0]
157
+ else:
158
+ shop_name = ''
159
+ cols = ['场景ID', '计划ID']
160
+ for col in cols:
161
+ df[col] = df[col].apply(lambda x: f'="{x}"' if x and '=' not in str(x) else x)
162
+ df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
163
+ root_new = pathlib.Path(self.source_path, '推广报表', '超级直播')
164
+ date_min = f'_{df["日期"].values.min()}_' # 仅适用于日期列未转换之前, 还是整数,转换后不能用这个函数
165
+ date_max = f'{df["日期"].values.max()}.csv'
166
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
167
+ new_name = pattern[0] + shop_name + date_min + date_max
168
+ self.save_to_csv(df, root_new, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
169
+ if self.set_up_to_mogo:
170
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_超级直播')
171
+ if self.set_up_to_mysql:
172
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_超级直播')
173
+ os.remove(os.path.join(root, name))
174
+ elif name.endswith('.xls') and '短直联投' in name:
175
+ # 短直联投
176
+ df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
177
+ df = pd.concat(df)
178
+ if len(df) == 0:
179
+ print(f'{name} 报表数据为空')
180
+ continue
181
+ new_name2 = os.path.splitext(name)[0] + '.csv'
182
+ df['订单Id'] = df['订单Id'].apply(
183
+ lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
184
+ )
185
+ root_new = pathlib.Path(self.source_path, '推广报表/短直联投')
186
+ self.save_to_csv(df, root_new, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
187
+ if self.set_up_to_mogo:
188
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_短直联投')
189
+ if self.set_up_to_mysql:
190
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_短直联投')
191
+ os.remove(os.path.join(root, name))
192
+ elif name.endswith('.xls') and '视频加速推广' in name:
193
+ # 超级短视频
194
+ df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
195
+ df = pd.concat(df)
196
+ if len(df) == 0:
197
+ print(f'{name} 报表数据为空')
198
+ continue
199
+ new_name2 = os.path.splitext(name)[0] + '.csv'
200
+ df['计划ID'] = df['计划ID'].apply(
201
+ lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
202
+ )
203
+ df['视频id'] = df['视频id'].apply(
204
+ lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
205
+ )
206
+ root_new = pathlib.Path(self.source_path, '推广报表/超级短视频')
207
+ self.save_to_csv(df, root_new, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
208
+ if self.set_up_to_mogo:
209
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_超级短视频')
210
+ if self.set_up_to_mysql:
211
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_超级短视频')
212
+ os.remove(os.path.join(root, name))
213
+ if '人群报表汇总' in name:
214
+ df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
215
+ if len(df) == 0:
216
+ print(f'{name} 报表数据为空')
217
+ continue
218
+ min_clm = df.min()['日期']
219
+ max_clm = df.max()['日期']
220
+ new_name = '{}{}{}'.format(min_clm, '_', max_clm)
221
+ df['点击率'] = df['点击率'].apply(lambda x: format(x, '.2%') if x > 0 else '') # 格式化成百分比
222
+ df['UV点击率'] = df['UV点击率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
223
+ df['收藏加购率'] = df['收藏加购率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
224
+ df['UV收藏加购率'] = df['UV收藏加购率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
225
+ df['点击转化率'] = df['点击转化率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
226
+ df['UV点击转化率'] = df['UV点击转化率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
227
+ df.replace(to_replace=[0], value='', regex=False, inplace=True)
228
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
229
+ df.to_csv(os.path.join(self.path, 'DMP报表_' + new_name + '.csv'), encoding='utf-8_sig',
230
+ index=False, header=True)
231
+ if self.set_up_to_mogo:
232
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_达摩盘_DMP报表',)
233
+ if self.set_up_to_mysql:
234
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_达摩盘_DMP报表')
235
+ os.remove(os.path.join(root, name))
236
+ # ----------------- 推广报表 分割线 -----------------
237
+ # ----------------- 推广报表 分割线 -----------------
238
+
239
+ date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
240
+ date02 = re.findall(r'\d{4}-\d{2}-\d{2}_(\d{4}-\d{2}-\d{2})', str(name))
241
+ if name.endswith('.xls') and '生意参谋' in name and '无线店铺流量来源' in name:
242
+ # 无线店铺流量来源
243
+ new_name = os.path.splitext(name)[0] + '.csv'
244
+ df = pd.read_excel(os.path.join(root, name), header=5)
245
+ if len(df) == 0:
246
+ print(f'{name} 报表数据为空')
247
+ continue
248
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
249
+ if date01[0] != date02[0]:
250
+ data_lis = date01[0] + '_' + date02[0]
251
+ df.insert(loc=0, column='数据周期', value=data_lis)
252
+ df.insert(loc=0, column='日期', value=date01[0])
253
+ # 2024-2-19 官方更新了推广渠道来源名称
254
+ df['三级来源'] = df['三级来源'].apply(
255
+ lambda x: '精准人群推广' if x == '精准人群推广(原引力魔方)'
256
+ else '关键词推广' if x == '关键词推广(原直通车)'
257
+ else '智能场景' if x == '智能场景(原万相台)'
258
+ else x
259
+ )
260
+ # df = df[df['访客数'] != '0']
261
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
262
+ for col in df.columns.tolist():
263
+ df[col] = df[col].apply(lambda x: 0 if not x else 0 if x == '' else x)
264
+ if '经营优势' in df['一级来源'].tolist(): # 新版流量
265
+ new_name = re.sub(r'\s?\(.*\)', '', new_name) # 删除小括号
266
+ new_name = os.path.splitext(new_name)[0] + '_新版.csv'
267
+
268
+ self.save_to_csv(df, root, new_name) # 因为 mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
269
+ if '经营优势' in df['一级来源'].tolist(): # 新版流量
270
+ if '数据周期' in df.columns.tolist():
271
+ if self.set_up_to_mogo:
272
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_店铺来源_月数据_新版')
273
+ if self.set_up_to_mysql:
274
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_店铺来源_月数据_新版')
275
+ else:
276
+ if self.set_up_to_mogo:
277
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_店铺来源_日数据_新版')
278
+ if self.set_up_to_mysql:
279
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_店铺来源_日数据_新版')
280
+ else: # 旧版流量
281
+ if '数据周期' in df.columns.tolist():
282
+ if self.set_up_to_mogo:
283
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_店铺来源_月数据')
284
+ if self.set_up_to_mysql:
285
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_店铺来源_月数据')
286
+ else:
287
+ if self.set_up_to_mogo:
288
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_店铺来源_日数据')
289
+ if self.set_up_to_mysql:
290
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_店铺来源_日数据')
291
+ os.remove(os.path.join(root, name))
292
+
293
+ elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
294
+ # 店铺商品排行
295
+ new_name = os.path.splitext(name)[0] + '.csv'
296
+ df = pd.read_excel(os.path.join(root, name), header=4)
297
+ if len(df) == 0:
298
+ print(f'{name} 报表数据为空')
299
+ continue
300
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
301
+ df['商品ID'] = df['商品ID'].apply(
302
+ lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
303
+ )
304
+ df['货号'] = df['货号'].apply(
305
+ lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
306
+ )
307
+ df.rename(columns={'统计日期': '日期', '商品ID': '商品id'}, inplace=True)
308
+ if date01[0] != date02[0]:
309
+ data_lis = date01[0] + '_' + date02[0]
310
+ df.insert(loc=1, column='数据周期', value=data_lis)
311
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
312
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
313
+ if self.set_up_to_mogo:
314
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_商品排行')
315
+ if self.set_up_to_mysql:
316
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_商品排行')
317
+ os.remove(os.path.join(root, name))
318
+
319
+ elif name.endswith('.xls') and '参谋店铺整体日报' in name:
320
+ # 自助取数,店铺日报
321
+ new_name = os.path.splitext(name)[0] + '.csv'
322
+ df = pd.read_excel(os.path.join(root, name), header=7)
323
+ if len(df) == 0:
324
+ print(f'{name} 报表数据为空')
325
+ continue
326
+ df.rename(columns={'统计日期': '日期'}, inplace=True)
327
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
328
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
329
+ if self.set_up_to_mogo:
330
+ d.df_to_mongo(df=df,db_name='生意参谋数据1', collection_name='生意参谋_自助取数_整体日报')
331
+ if self.set_up_to_mysql:
332
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_自助取数_整体日报')
333
+ os.remove(os.path.join(root, name))
334
+
335
+ elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
336
+ # 自助取数,每日流量
337
+ new_name = os.path.splitext(name)[0] + '.csv'
338
+ df = pd.read_excel(os.path.join(root, name), header=7)
339
+ if len(df) == 0:
340
+ print(f'{name} 报表数据为空')
341
+ continue
342
+ df.rename(columns={'统计日期': '日期'}, inplace=True)
343
+ # 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
344
+ df['三级来源'] = df['三级来源'].apply(
345
+ lambda x: '精准人群推广' if x == '引力魔方'
346
+ else '关键词推广' if x == '直通车'
347
+ else '智能场景' if x == '万相台'
348
+ else '精准人群推广' if x == '精准人群推广(原引力魔方)'
349
+ else '关键词推广' if x == '关键词推广(原直通车)'
350
+ else '智能场景' if x == '智能场景(原万相台)'
351
+ else x
352
+ )
353
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
354
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
355
+ if self.set_up_to_mogo:
356
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_自助取数_每日流量')
357
+ if self.set_up_to_mysql:
358
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_自助取数_每日流量')
359
+ os.remove(os.path.join(root, name))
360
+
361
+ elif name.endswith('.xls') and '商品sku' in name:
362
+ # 自助取数,商品sku
363
+ new_name = os.path.splitext(name)[0] + '.csv'
364
+ df = pd.read_excel(os.path.join(root, name), header=7)
365
+ if len(df) == 0:
366
+ print(f'{name} 报表数据为空')
367
+ continue
368
+ df.rename(columns={
369
+ '统计日期': '日期',
370
+ '商品ID': '商品id',
371
+ 'SKU ID': 'sku id',
372
+ '商品SKU': '商品sku',
373
+ }, inplace=True)
374
+ for _i in ['商品id', 'sku id']:
375
+ df[_i] = df[_i].astype(str).apply(lambda x: f'="{x}"')
376
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
377
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
378
+ if self.set_up_to_mogo:
379
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_自助取数_商品sku')
380
+ if self.set_up_to_mysql:
381
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_自助取数_商品sku')
382
+ os.remove(os.path.join(root, name))
383
+
384
+ elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
385
+ # 自助取数,月店铺流量来源
386
+ new_name = os.path.splitext(name)[0] + '.csv'
387
+ df = pd.read_excel(os.path.join(root, name), header=7)
388
+ if len(df) == 0:
389
+ print(f'{name} 报表数据为空')
390
+ continue
391
+ df.rename(columns={'统计日期': '数据周期'}, inplace=True)
392
+ # 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
393
+ df['三级来源'] = df['三级来源'].apply(
394
+ lambda x: '精准人群推广' if x == '引力魔方'
395
+ else '关键词推广' if x == '直通车'
396
+ else '智能场景' if x == '万相台'
397
+ else '精准人群推广' if x == '精准人群推广(原引力魔方)'
398
+ else '关键词推广' if x == '关键词推广(原直通车)'
399
+ else '智能场景' if x == '智能场景(原万相台)'
400
+ else x
401
+ )
402
+ df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
403
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
404
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
405
+ if self.set_up_to_mogo:
406
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_自助取数_店铺流量_月数据')
407
+ if self.set_up_to_mysql:
408
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_自助取数_店铺流量_月数据')
409
+ os.remove(os.path.join(root, name))
410
+
411
+ elif name.endswith('.csv') and 'baobei' in name:
412
+ # 生意经宝贝指标日数据
413
+ # print(name)
414
+ date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
415
+ if not date: # 阻止月数据及已转换的表格
416
+ print(f'{name} 不支持或是已转换的表格')
417
+ os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
418
+ continue
419
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
420
+ if len(df) == 0:
421
+ print(f'{name} 报表数据为空')
422
+ continue
423
+ if '日期' in df.columns.tolist():
424
+ df.pop('日期')
425
+ new_date = '-'.join(date[0])
426
+ df.insert(loc=0, column='日期', value=new_date)
427
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
428
+ df['宝贝ID'] = df['宝贝ID'].apply(
429
+ lambda x: f'="{x}"' if x and '=' not in str(x) else x
430
+ )
431
+ df['商家编码'] = df['商家编码'].apply(
432
+ lambda x: f'="{x}"' if x and '=' not in str(x) else x
433
+ )
434
+ name_st = re.findall(r'(.*)\d{4}\d{2}\d{2}\.', str(name)) # baobeitrans-
435
+ new_name = f'{name_st[0]}{new_date}.csv'
436
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
437
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
438
+ if self.set_up_to_mogo:
439
+ d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_宝贝指标')
440
+ if self.set_up_to_mysql:
441
+ m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_宝贝指标')
442
+ os.remove(os.path.join(root, name))
443
+
444
+ elif name.endswith('.csv') and '店铺销售指标' in name:
445
+ # 生意经, 店铺指标,仅限月数据,实际日指标也可以
446
+ name_st = re.findall(r'(.*)\(分日', name)
447
+ if not name_st:
448
+ print(f'{name} 已转换的表格')
449
+ continue
450
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
451
+ if len(df) == 0:
452
+ print(f'{name} 报表数据为空')
453
+ continue
454
+ df['日期'] = df['日期'].astype(str).apply(
455
+ lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
456
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # 转换日期列
457
+ # min_clm = str(df.min()['日期']).split(' ')[0]
458
+ # max_clm = str(df.max()['日期']).split(' ')[0]
459
+ min_clm = str(df['日期'].min()).split(' ')[0]
460
+ max_clm = str(df['日期'].max()).split(' ')[0]
461
+ new_name = f'{name_st[0]}-{min_clm}_{max_clm}.csv' # 保存时将(分日)去掉
462
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
463
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
464
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
465
+ if self.set_up_to_mogo:
466
+ d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_店铺指标')
467
+ if self.set_up_to_mysql:
468
+ m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_店铺指标')
469
+ os.remove(os.path.join(root, name))
470
+
471
+ elif name.endswith('csv') and '省份' in name:
472
+ # 生意经,地域分布, 仅限日数据
473
+ pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
474
+ if not pattern or '省份城市分析2' not in name:
475
+ print(f'{name} 不支持或已转换的表格')
476
+ os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
477
+ continue
478
+ date = pattern[0][1:]
479
+ date = '-'.join(date)
480
+ new_name = f'{pattern[0][0]}-{date}.csv'
481
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
482
+ if len(df) == 0:
483
+ print(f'{name} 报表数据为空')
484
+ continue
485
+ df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
486
+ df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
487
+ df['省'].fillna(method='ffill', inplace=True)
488
+ df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
489
+ pov = df.pop('省')
490
+ city = df.pop('城市')
491
+ df['省+市'] = df['省份']
492
+ df['省份'] = pov
493
+ df.insert(loc=1, column='城市', value=city)
494
+ df.insert(loc=0, column='日期', value=date)
495
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
496
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
497
+ if self.set_up_to_mogo:
498
+ d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_地域分布_省份城市分析')
499
+ if self.set_up_to_mysql:
500
+ m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_地域分布_省份城市分析')
501
+ os.remove(os.path.join(root, name)) # 移除已转换的原文件
502
+
503
+ elif name.endswith('csv') and 'order' in name:
504
+ # 生意经,订单数据,仅限月数据
505
+ pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
506
+ if not pattern:
507
+ print(f'{name} 不支持或已转换的表格')
508
+ os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
509
+ continue
510
+ date1 = pattern[0][1:4]
511
+ date1 = '-'.join(date1)
512
+ date2 = pattern[0][4:]
513
+ date2 = '-'.join(date2)
514
+ date = f'{date1}_{date2}'
515
+ new_name = f'{pattern[0][0]}{date}.csv'
516
+ df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
517
+ if len(df) == 0:
518
+ print(f'{name} 报表数据为空')
519
+ continue
520
+ df.insert(loc=0, column='日期', value=date1)
521
+ df.insert(loc=1, column='数据周期', value=date)
522
+ df['商品id'] = df['宝贝链接'].apply(
523
+ lambda x: f'=\"{"".join(re.findall("id=(.*)", str(x))[0])}\"' if x else x)
524
+ df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
525
+ df['颜色编码'] = df['商家编码'].apply(
526
+ lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
527
+ df['商家编码'] = df['商家编码'].apply(lambda x: f'="{x}"' if x else x)
528
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
529
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
530
+ if self.set_up_to_mogo:
531
+ d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_订单数据')
532
+ if self.set_up_to_mysql:
533
+ m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_订单数据')
534
+ os.remove(os.path.join(root, name)) # 移除已转换的原文件
535
+
536
+ elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
537
+ # 直播间成交订单明细
538
+ df = pd.read_excel(os.path.join(root, name), header=0)
539
+ if len(df) == 0:
540
+ print(f'{name} 报表数据为空')
541
+ continue
542
+ df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
543
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
544
+ cols = ['开播时间', '下单时间', '支付时间', '确认收货时间']
545
+ for col in cols:
546
+ df[col] = pd.to_datetime(df[col]) # 转换日期列
547
+ for col2 in ['支付金额', '确认收货金额']:
548
+ df[col2] = pd.to_numeric(df[col2], errors='ignore')
549
+ df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
550
+ date_min = df['日期'].values.min() + '_'
551
+ date_max = df['日期'].values.max()
552
+ new_name = '直播间成交订单明细_' + date_min + date_max + '.csv'
553
+ for col3 in ['场次id', '商品id', '父订单', '子订单']:
554
+ df[col3] = df[col3].apply(
555
+ lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
556
+ )
557
+ col4 = ['日期', '直播标题', '开播时间', '场次id', '支付时间', '支付金额', '商品id', '商品标题',
558
+ '商品一级类目', '父订单', '子订单', '下单时间', '确认收货时间', '确认收货金额']
559
+ df_lin = df[col4]
560
+ # 调整列顺序
561
+ df = pd.merge(df_lin, df, how='outer', on=col4)
562
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
563
+ if self.set_up_to_mogo:
564
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_直播间成交订单明细')
565
+ if self.set_up_to_mysql:
566
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_直播间成交订单明细')
567
+ os.remove(os.path.join(root, name))
568
+
569
+ elif name.endswith('.xlsx') and '直播间大盘数据' in name:
570
+ # 直播间大盘数据
571
+ df = pd.read_excel(os.path.join(root, name), header=0)
572
+ if len(df) == 0:
573
+ print(f'{name} 报表数据为空')
574
+ continue
575
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
576
+ df.rename(columns={'统计日期': '日期'}, inplace=True)
577
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
578
+ df['日期'] = df['日期'].apply(lambda x: x.strftime('%Y-%m-%d'))
579
+ date_min = df['日期'].values.min() + '_'
580
+ date_max = df['日期'].values.max()
581
+ new_name = '直播间大盘数据_' + date_min + date_max + '.csv'
582
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
583
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
584
+ if self.set_up_to_mogo:
585
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_直播间大盘数据')
586
+ if self.set_up_to_mysql:
587
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_直播间大盘数据')
588
+ os.remove(os.path.join(root, name))
589
+
590
+ elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
591
+ # 直播业绩-成交拆解
592
+ df = pd.read_excel(os.path.join(root, name), header=5)
593
+ if len(df) == 0:
594
+ print(f'{name} 报表数据为空')
595
+ continue
596
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
597
+ df.replace(to_replace=[','], value='', regex=True, inplace=True)
598
+ df.rename(columns={'统计日期': '日期'}, inplace=True)
599
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
600
+ df['日期'] = df['日期'].apply(lambda x: x.strftime('%Y-%m-%d'))
601
+ date_min = df['日期'].values.min() + '_'
602
+ date_max = df['日期'].values.max()
603
+ new_name = '直播业绩_成交拆解_' + date_min + date_max + '.csv'
604
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
605
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
606
+ if self.set_up_to_mogo:
607
+ d.df_to_mongo(df=df, db_name='生意参谋数据1', collection_name='生意参谋_直播业绩')
608
+ if self.set_up_to_mysql:
609
+ m.df_to_mysql(df=df, db_name='生意参谋数据1', tabel_name='生意参谋_直播业绩')
610
+ os.remove(os.path.join(root, name))
611
+
612
+ elif name.endswith('.xlsx') and '明星店铺' in name:
613
+ # 品销宝
614
+ pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})_', name)
615
+ if pattern:
616
+ continue
617
+ sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群'] # 品销宝
618
+ file_name4 = os.path.splitext(name)[0] # 明星店铺报表
619
+ for sheet4 in sheets4:
620
+ df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
621
+ # print(sheet4)
622
+ if len(df) == 0:
623
+ print(f'{name} 报表数据为空')
624
+ continue
625
+ if len(df) < 1:
626
+ print(f'{name} 跳过')
627
+ continue
628
+ else:
629
+ df.insert(loc=1, column='报表类型', value=sheet4)
630
+ df.fillna(0, inplace=True)
631
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # 转换日期列
632
+ min_clm = str(df['日期'].min()).split(' ')[0]
633
+ max_clm = str(df['日期'].max()).split(' ')[0]
634
+ new_file_name4 = f'{sheet4}_{file_name4}_{min_clm}_{max_clm}.csv'
635
+ # 以sheet名进一步创建子文件夹
636
+ root_new = str(pathlib.Path(self.source_path, '推广报表/品销宝', sheet4))
637
+ self.save_to_csv(df, root_new, new_file_name4) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
638
+ if self.set_up_to_mogo:
639
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_品销宝')
640
+ if self.set_up_to_mysql:
641
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_品销宝')
642
+ os.remove(os.path.join(root, name))
643
+
644
+ elif name.endswith('.csv') and '淘宝店铺数据' in name:
645
+ df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
646
+ if self.set_up_to_mogo:
647
+ d.df_to_mongo(df=df, db_name='市场数据1', collection_name='淘宝店铺数据')
648
+ if self.set_up_to_mysql:
649
+ m.df_to_mysql(df=df, db_name='市场数据1', tabel_name='淘宝店铺数据')
650
+
651
+ elif name.endswith('.csv') and '人群洞察' in name:
652
+ df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
653
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
654
+ df = df[df['人群规模'] != '']
655
+ if len(df) == 0:
656
+ os.remove(os.path.join(root, name))
657
+ print(f'{name}: 数据为空, 已移除: {os.path.join(root, name)}')
658
+ continue
659
+ if self.set_up_to_mogo:
660
+ d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='万相台_人群洞察')
661
+ if self.set_up_to_mysql:
662
+ m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='万相台_人群洞察')
663
+
664
+ # ----------------------- 京东数据处理分界线 -----------------------
665
+ elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
666
+ # 京东店铺来源
667
+ if '按天' not in name:
668
+ print(f'{name} 京东流量请按天下载')
669
+ continue
670
+ new_name = name.split(r'__20')[0]
671
+ date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
672
+ new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
673
+ new_date02 = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
674
+ new_date03 = f'{new_date01}_{new_date02}'
675
+ df = pd.read_excel(os.path.join(root, name), header=0)
676
+ if len(df) == 0:
677
+ print(f'{name} 报表数据为空')
678
+ continue
679
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
680
+ df.insert(loc=0, column='日期', value=new_date01)
681
+ if new_date01 != new_date02:
682
+ df.insert(loc=1, column='数据周期', value=new_date03)
683
+ cols = df.columns.tolist()
684
+ if '三级来源' in cols:
685
+ source = '三级来源'
686
+ elif '二级来源' in cols:
687
+ source = '二级来源'
688
+ else:
689
+ source = '一级来源'
690
+
691
+ new_name = f'{new_name}_{source}_{new_date03}.csv'
692
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
693
+ self.save_to_csv(df, root, new_name) # csv 文件仍然保留这些列
694
+ for col_2024 in cols: # 京东这个表有字段加了去年日期,删除这些同比数据字段,不然列数量爆炸
695
+ if '20' in col_2024 and '流量来源' in name:
696
+ df.drop(col_2024, axis=1, inplace=True)
697
+ if self.set_up_to_mogo:
698
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_流量来源_日数据')
699
+ if self.set_up_to_mysql:
700
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_流量来源_日数据')
701
+ os.remove(os.path.join(root, name))
702
+
703
+ elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
704
+ # 京东商品明细 文件转换
705
+ date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
706
+ if not date1[0]:
707
+ print(f'{name}: 仅支持日数据')
708
+ continue
709
+ if date1:
710
+ date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
711
+ df = pd.read_excel(os.path.join(root, name), header=0)
712
+ if len(df) == 0:
713
+ print(f'{name} 报表数据为空')
714
+ continue
715
+ if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
716
+ new_name = f'sku_{date1}_全部渠道_商品明细.csv'
717
+ elif '10021440233518' in df['商品ID'].values or '10022867813485' in df['商品ID'].values:
718
+ new_name = f'spu_{date1}_全部渠道_商品明细.csv'
719
+ else:
720
+ new_name = f'未分类_{date1}_全部渠道_商品明细.csv'
721
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
722
+ df.rename(columns={'商品ID': '商品id'}, inplace=True)
723
+ df['商品id'] = df['商品id'].apply(lambda x: f'="{x}"' if x else x)
724
+ df['货号'] = df['货号'].apply(lambda x: f'="{x}"' if x else x)
725
+ df.insert(loc=0, column='日期', value=date1)
726
+
727
+ self.save_to_csv(df, root, new_name)
728
+ if self.set_up_to_mogo:
729
+ if 'sku' in new_name:
730
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_sku_商品明细')
731
+ elif 'spu' in new_name:
732
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_spu_商品明细')
733
+ if self.set_up_to_mysql:
734
+ if 'sku' in new_name:
735
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_sku_商品明细')
736
+ elif 'spu' in new_name:
737
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_spu_商品明细')
738
+ os.remove(os.path.join(root, name))
739
+ elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
740
+ # 京东商品词下排名
741
+ new_name = os.path.splitext(name)[0] + '.csv'
742
+ # print(name)
743
+ df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
744
+ if len(df) == 0:
745
+ print(f'{name} 报表数据为空')
746
+ continue
747
+ df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
748
+ df['skuid'] = df['skuid'].apply(lambda x: f'="{x}"' if x and '=' not in str(x) else x)
749
+ self.save_to_csv(df, root, new_name)
750
+ if self.set_up_to_mogo:
751
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_商品词下排名')
752
+ if self.set_up_to_mysql:
753
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_商品词下排名')
754
+ os.remove(os.path.join(root, name)) # 移除已转换的原文件
755
+
756
+ elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
757
+ # 京东商品排名
758
+ new_name = os.path.splitext(name)[0] + '.csv'
759
+ date_in = re.findall(r'(\d{4}-\d{2}-\d{2})-搜索', str(name))[0]
760
+ df = pd.read_excel(os.path.join(root, name), header=0)
761
+ if len(df) == 0:
762
+ print(f'{name} 报表数据为空')
763
+ continue
764
+ df.insert(0, '日期', date_in) # 插入新列
765
+ df.rename(columns={'SKU': 'skuid'}, inplace=True)
766
+ df['skuid'] = df['skuid'].apply(lambda x: f'="{x}"' if x and '=' not in str(x) else x)
767
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
768
+ if self.set_up_to_mogo:
769
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_商品排名')
770
+ if self.set_up_to_mysql:
771
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_商品排名')
772
+ os.remove(os.path.join(root, name)) # 移除已转换的原文件
773
+
774
+ elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
775
+ # 京东,竞争-竞店概况-竞店详情-全部渠道
776
+ date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
777
+ start_date = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
778
+ end_date = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
779
+ df = pd.read_excel(os.path.join(root, name), header=0)
780
+ if len(df) == 0:
781
+ print(f'{name} 报表数据为空')
782
+ continue
783
+ df.replace(to_replace=[','], value='', regex=True, inplace=True)
784
+ df.insert(loc=0, column='日期', value=start_date)
785
+ new_name = f'{os.path.splitext(name)[0]}'
786
+ new_name = re.sub(r'\d{8}_\d{8}', f'{start_date}_{end_date}', new_name)
787
+ self.save_to_csv(df, root, new_name)
788
+ if self.set_up_to_mogo:
789
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_竞店监控_日数据')
790
+ if self.set_up_to_mysql:
791
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_竞店监控_日数据')
792
+ os.remove(os.path.join(root, name))
793
+
794
+ elif name.endswith('.xls') and '店铺' in name:
795
+ # 京东 自助报表 店铺日报
796
+ df = pd.read_excel(os.path.join(root, name), header=0)
797
+ if len(df) == 0:
798
+ print(f'{name} 报表数据为空')
799
+ continue
800
+ df['日期'] = df['日期'].apply(
801
+ lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
802
+ )
803
+ date_min = df['日期'].values.min()
804
+ date_max = df['日期'].values.max()
805
+ # df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
806
+ new_name = f'JD店铺日报_' + re.findall(r"(.*)\d{8}_\d{8}", name)[0] + f'_{date_min}_{date_max}.csv'
807
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
808
+ if self.set_up_to_mogo:
809
+ d.df_to_mongo(df=df, db_name='京东数据1', collection_name='京东_自助取数_店铺日报')
810
+ if self.set_up_to_mysql:
811
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_自助取数_店铺日报')
812
+ os.remove(os.path.join(root, name))
813
+
814
+ elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
815
+ # 京东 行业 商家榜单
816
+ date2 = re.findall(r'_\d{8}-\d+', name)
817
+ if date2:
818
+ print(f'{name}: 请下载日数据,不支持其他周期')
819
+ os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
820
+ continue
821
+ date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})', name)
822
+ date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
823
+ df = pd.read_excel(os.path.join(root, name), header=0)
824
+ if len(df) == 0:
825
+ print(f'{name} 报表数据为空')
826
+ continue
827
+ df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
828
+ df.insert(loc=0, column='类型', value='商家榜单')
829
+ new_name = f'{os.path.splitext(name)[0]}_{date1}.csv'
830
+ self.save_to_csv(df, root, new_name)
831
+ if self.set_up_to_mogo:
832
+ d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_商家榜单')
833
+ if self.set_up_to_mysql:
834
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_商家榜单')
835
+ os.remove(os.path.join(root, name))
836
+
837
+ elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
838
+ # 京东 sku 导出
839
+ df = pd.read_excel(os.path.join(root, name), header=0)
840
+ if len(df) == 0:
841
+ print(f'{name} 报表数据为空')
842
+ continue
843
+ d_time = datetime.datetime.today().strftime('%Y-%m-%d')
844
+ df.insert(loc=0, column='日期', value=d_time)
845
+ for col in ['SKUID', '商品编码', '商家SKU', '货号']:
846
+ df[col] = df[col].apply(lambda x: f'="{x}"' if x else x)
847
+ df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
848
+ new_name = f'京东商品信息_{os.path.splitext(name)[0]}_{d_time}.csv'
849
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
850
+ if self.set_up_to_mogo:
851
+ d.df_to_mongo(df=df, db_name='属性设置1', collection_name='京东商品信息')
852
+ if self.set_up_to_mysql:
853
+ m.df_to_mysql(df=df, db_name='属性设置1', tabel_name='京东商品信息')
854
+ os.remove(os.path.join(root, name))
855
+
856
+ elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
857
+ # 京东 spu 导出
858
+ df = pd.read_excel(os.path.join(root, name), header=0)
859
+ if len(df) == 0:
860
+ print(f'{name} 报表数据为空')
861
+ continue
862
+ d_time = datetime.datetime.today().strftime('%Y-%m-%d')
863
+ df.insert(loc=0, column='日期', value=d_time)
864
+ for col in ['商品编码', '货号']:
865
+ df[col] = df[col].apply(lambda x: f'="{x}"' if x else x)
866
+ new_name = f'京东商品信息_{os.path.splitext(name)[0]}_{d_time}.csv'
867
+
868
+ self.save_to_csv(df, root, new_name)
869
+ os.remove(os.path.join(root, name))
870
+
871
+ elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
872
+ # 京东推广数据
873
+ df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
874
+ if len(df) == 0:
875
+ print(f'{name} 报表数据为空')
876
+ continue
877
+ pic_list = df['日期'].tolist()
878
+ pic = []
879
+ for i in pic_list:
880
+ pics = re.findall(pattern=r'(\d{4})(\d{2})(\d{2})', string=str(i))
881
+ if pics:
882
+ pics = '-'.join(pics[0])
883
+ pic.append(pics)
884
+ else:
885
+ pic.append(i)
886
+ df['日期'] = pd.Series(pic)
887
+ date_min = df['日期'].values.min() + '_'
888
+ date_max = df['日期'].values.max()
889
+ new_name2 = '京东点击成交报表_' + date_min + date_max + '.csv'
890
+ for col in ['计划ID', '触发SKU ID', '跟单SKU ID', 'SPU ID']:
891
+ df[col] = df[col].astype(str).apply(lambda x: f'="{x}"' if x and '=' not in x else x)
892
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
893
+ self.save_to_csv(df, root, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
894
+ if self.set_up_to_mogo:
895
+ d.df_to_mongo(df=df, db_name='京东数据1', collection_name='京东_推广_京准通')
896
+ if self.set_up_to_mysql:
897
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_推广_京准通')
898
+ os.remove(os.path.join(root, name))
899
+ elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
900
+ df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
901
+ if len(df) == 0:
902
+ print(f'{name} 报表数据为空')
903
+ continue
904
+ pic_list = df['日期'].tolist()
905
+ pic = []
906
+ for i in pic_list:
907
+ pics = re.findall(pattern=r'(\d{4})(\d{2})(\d{2})', string=str(i))
908
+ if pics:
909
+ pics = '-'.join(pics[0])
910
+ pic.append(pics)
911
+ else:
912
+ pic.append(i)
913
+ df['日期'] = pd.Series(pic)
914
+ date_min = df['日期'].values.min() + '_'
915
+ date_max = df['日期'].values.max()
916
+ new_name2 = '京东推广搜索词_' + date_min + date_max + '.csv'
917
+ df.replace(to_replace=[0], value='', regex=False, inplace=True)
918
+ df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
919
+ df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
920
+ self.save_to_csv(df, root, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
921
+ if self.set_up_to_mogo:
922
+ d.df_to_mongo(df=df, db_name='京东数据1', collection_name='京东_推广_搜索词报表')
923
+ if self.set_up_to_mysql:
924
+ m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_推广_搜索词报表')
925
+ os.remove(os.path.join(root, name))
926
+
927
+ elif name.endswith('.xlsx') and '零售明细统计' in name:
928
+ #
929
+ df = pd.read_excel(os.path.join(root, name), header=0)
930
+ if len(df) == 0:
931
+ print(f'{name} 报表数据为空')
932
+ continue
933
+ df['摘要'] = df['摘要'].apply(lambda x: re.sub('\'', '', str(x)) if x else x)
934
+ for col in ['原单号', '商品代码', '摘要']:
935
+ df[col] = df[col].apply(lambda x: f'="{re.sub(".0", "", str(x))}"' if x else x)
936
+ df = df[df['缩略图'] != '合计']
937
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
938
+ date_min = f'_{re.sub("T.*", "", str(df["日期"].values.min()))}_'
939
+ date_max = f'{re.sub("T.*", "", str(df["日期"].values.max()))}.csv'
940
+ new_name = re.findall(r'(.*)_\d{4}-\d{2}-\d{2}', name)[0]
941
+ new_name = f'{new_name}{date_min}{date_max}'
942
+ self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
943
+ if self.set_up_to_mogo:
944
+ d.df_to_mongo(df=df, db_name='生意经1', collection_name='E3_零售明细统计')
945
+ if self.set_up_to_mysql:
946
+ m.df_to_mysql(df=df, db_name='生意经1', tabel_name='E3_零售明细统计')
947
+ os.remove(os.path.join(root, name))
948
+ if self.set_up_to_mogo:
949
+ if d.client:
950
+ d.client.close() # 必须手动关闭数据库连接
951
+
952
+ """
953
+ {文件分类}
954
+ 将已处理完的文件 分类移到原始文件夹下
955
+ 此处t_path参数定义了子文件夹的生成名称
956
+ """
957
+
958
@staticmethod
def move_files(path, _name, target_path, _as_month=None):
    """
    Move one processed file from ``path`` into ``target_path``.

    path: directory that currently contains the file.
    _name: name of the file inside ``path`` (not a full path).
    target_path: destination directory; it is created when missing.
    _as_month: when truthy and ``_name`` contains a ``YYYY-MM-DD`` date, the
        file is stored in a ``YYYY-MM`` sub-folder of ``target_path``.

    A file with the same name that already exists at the destination is
    replaced by the new one.

    Raises FileNotFoundError when ``path/_name`` does not exist. The check
    happens before anything at the destination is touched, so a missing
    source can no longer wipe the previously archived copy.
    """
    source_file = os.path.join(path, _name)
    if not os.path.exists(source_file):
        raise FileNotFoundError(source_file)

    t2 = target_path
    if _as_month:
        _date = re.findall(r'(\d{4}-\d{2})-\d{2}', str(_name))
        if _date:
            t2 = pathlib.Path(t2, _date[0])  # group by year-month
    os.makedirs(t2, exist_ok=True)

    old_file = os.path.join(t2, _name)
    if os.path.isfile(old_file):
        if os.path.samefile(source_file, old_file):
            # The file already sits at its destination; removing the "old"
            # copy would delete the only copy there is.
            return
        os.remove(old_file)  # replace the stale copy
    shutil.move(source_file, t2)
978
+
979
+ # @try_except
980
def move_all(self, path=None):
    """
    Sort converted ``.csv`` reports found below ``path`` into the archive.

    path: folder to scan (sub-folders included); defaults to ``self.path``.

    Each file name is checked against an ordered list of keywords; the first
    keyword contained in the name decides the sub-folder of
    ``self.source_path`` the file is handed to via ``self.move_files`` and
    whether it is grouped into ``YYYY-MM`` folders. Files that match no
    keyword, or that match a keyword requiring a date the name does not
    carry, are left where they are.
    """
    if not path:
        path = self.path

    date_suffix = r'(\d{4})-(\d{2})-(\d{2})\.'
    # (keyword, sub-folder, group by month, pattern the name must contain).
    # Order matters: the first keyword found in the file name wins, so more
    # specific keywords must come before the broader ones they contain.
    # A sub-folder of None marks the JD product detail report, whose folder
    # depends on the sku/spu prefix of the file name.
    rules = (
        ('商品_全部', '生意参谋/商品排行', True, None),
        ('参谋店铺整体日报', '生意参谋/全店数据-自助取数', True, None),
        ('参谋每日流量_自助取数', '生意参谋/流量来源-自助取数', True, None),
        ('商品sku', '生意参谋/商品sku-自助取数', True, None),
        ('参谋店铺流量来源(月)', '月数据/流量来源-自助取数-月数据', True, None),
        # Must precede '竞店分析-', otherwise this rule can never match.
        ('竞店分析-流量分析', '市场数据/竞店流量构成', True, None),
        ('竞店分析-', '市场数据/竞店分析', True, None),
        ('监控店铺数据', '市场数据/监控店铺数据', True, None),
        ('监控商品', '市场数据/监控商品数据', True, None),
        ('类目洞察', '市场数据/类目洞察', True, None),
        ('市场排行_店铺排行', '市场数据/市场二级类目店铺', True, None),
        # Dated rules skip monthly exports and tables that are not converted yet.
        ('baobei', '生意经/宝贝指标', True, r's-(\d{4})-(\d{2})-(\d{2})\.'),
        ('省份城市分析', '生意经/地域分布', True, date_suffix),
        ('店铺销售指标', '生意经/店铺指标', False, date_suffix),
        ('order', '生意经/订单数据', False, date_suffix),
        ('直播间成交订单明细', '生意参谋/直播订单明细', True, None),
        ('直播间大盘数据', '生意参谋/直播间大盘数据', True, None),
        ('直播业绩_成交拆解', '生意参谋/直播业绩_成交拆解', True, None),
        ('DMP报表', '推广报表/DMP报表', True, None),
        ('人群洞察', '推广报表/人群洞察', True, None),
        ('客户_客户概况_画像', '生意参谋/客户_客户概况_画像', True, None),
        ('市场排行_店铺', '市场数据/市场排行', True, None),
        ('淘宝店铺数据', '市场数据/其他数据', False, None),
        ('零售明细统计', '生意经/E3零售明细统计', True, None),
        # ---- JD reports: start ----
        ('全部渠道_商品明细', None, True, None),
        ('竞店概况_竞店详情', '京东报表/JD竞店监控数据', True, None),
        ('京东推广搜索词', '京东报表/JD推广搜索词报表', True, None),
        ('京东点击成交报表', '京东报表/JD推广报表', True, None),
        ('搜索分析-排名定位-商品词下排名', '京东报表/JD排名定位/商品词下排名', True, None),
        ('搜索分析-排名定位-商品排名', '京东报表/JD排名定位/商品排名', True, None),
        ('按天_店铺来源_流量来源', '京东报表/JD流量来源', True, None),
        ('JD店铺日报', '京东报表/JD店铺日报', True, None),
        ('商家榜单_女包_整体', '京东报表/JD商家榜单', True, None),
        ('导出-批量任务', '京东报表/商品信息导出', False, None),
        ('_行业分析_竞争分析', '京东报表/行业竞争分析', True, None),
        ('付费广告_行业分析_行业大盘', '京东报表/行业大盘_流量排行', False, None),
        # ---- JD reports: end ----
    )

    archive_root = os.path.abspath(str(self.source_path))
    for root, dirs, files in os.walk(path, topdown=False):
        here = os.path.abspath(root)
        if here == archive_root or here.startswith(archive_root + os.sep):
            # Files that already live in the archive must not be re-sorted.
            continue
        for name in files:
            if not name.endswith('.csv'):
                continue

            subfolder, as_month = None, None
            if '无线店铺流量来源' in name:
                # The name carries "<start date>_<end date>"; a span of more
                # than 15 days (compared on day-of-month) is a monthly export.
                date01 = re.findall(r'\d{4}-\d{2}-(\d{2})_\d{4}-\d{2}-(\d{2})', name)
                if not date01:
                    print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                    continue
                if int(date01[0][1]) - int(date01[0][0]) > 15:
                    subfolder, as_month = '月数据/流量来源', None
                else:
                    subfolder, as_month = '生意参谋/流量来源', True
            else:
                for keyword, folder, by_month, required in rules:
                    if keyword not in name:
                        continue
                    if required is None or re.findall(required, name):
                        if folder is None:
                            if 'sku' in name:
                                folder = '京东报表/JD商品明细sku'
                            elif 'spu' in name:
                                folder = '京东报表/JD商品明细spu'
                            else:
                                folder = '京东报表/未找到分类数据'
                        subfolder, as_month = folder, by_month
                    break  # first matching keyword decides, even when it skips

            if subfolder is None:
                continue
            # Use the folder the file was actually found in, not the scan root,
            # so files in sub-folders are moved as well.
            self.move_files(
                path=root,
                _name=name,
                target_path=str(pathlib.Path(self.source_path, subfolder)),
                _as_month=as_month,
            )
1125
+
1126
def attribute(self, path=None, _str='商品素材导出', ):
    """
    Convert product-material exports (``.xlsx``) to dated ``.csv`` files.

    path: folder to scan (sub-folders included); defaults to ``self.path``.
    _str: label inserted into the generated csv file name.

    Only workbooks whose file name contains no Chinese character and no
    ``~`` and that have both a '商品白底图' and a '方版场景图' column are
    processed. For those the file's modification date is added as '日期',
    the shop is derived from well-known product ids, the result is saved
    under ``<source_path>/属性设置/商品素材`` and, depending on
    ``set_up_to_mogo`` / ``set_up_to_mysql``, uploaded. The workbook is
    removed afterwards; upload errors are printed and do not stop the run.
    """
    db_name = '属性设置2'
    collection_name = '商品素材导出'
    if not path:
        path = self.path

    # Product ids used to recognise which shop an export belongs to.
    flagship_ids = {652737455554, 683449516249, 37114359548, 570735930393}
    enterprise_ids = {704624764420, 701781021639, 520380314717}

    d = None
    m = None
    if self.set_up_to_mogo:
        username, password, host, port = get_myconf.select_config_values(target_service='home_lx',
                                                                         database='mongodb')
        d = mongo.UploadMongo(username=username, password=password, host=host, port=port,
                              drop_duplicates=False
                              )
    try:
        if self.set_up_to_mysql:
            username, password, host, port = get_myconf.select_config_values(target_service='home_lx',
                                                                             database='mysql')
            m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)
        new_save_path = os.path.join(self.source_path, '属性设置', '商品素材')
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                # '~' also covers Office lock files ('~$...').
                if not name.endswith('.xlsx') or '~' in name:
                    continue
                if re.findall('([\u4e00-\u9fa5])', name):
                    continue  # names with Chinese characters are other reports
                if 'DS_Store' in name:
                    continue
                file_path = os.path.join(root, name)
                df = pd.read_excel(file_path, header=0, engine='openpyxl')
                col = df.columns.tolist()
                if '商品白底图' not in col or '方版场景图' not in col:
                    continue

                # The export carries no date, so the file's modification time is used.
                mtime = time.strftime('%Y-%m-%d', time.localtime(os.stat(file_path).st_mtime))
                df['日期'] = mtime
                df.rename(columns={'商品ID': '商品id'}, inplace=True)
                # mtime is always a valid '%Y-%m-%d' string, so the deprecated
                # errors='ignore' fallback is not needed.
                df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d')

                product_ids = set(df['商品id'].tolist())
                base_name = os.path.splitext(name)[0]
                if product_ids & flagship_ids:
                    shop, prefix = '万里马官方旗舰店', 'tm_'
                elif product_ids & enterprise_ids:
                    shop, prefix = '万里马官方企业店', 'tb_'
                else:
                    shop, prefix = 'coome旗舰店', 'coome_'
                df.insert(0, '店铺名称', shop)
                new_name = prefix + base_name + f'_{_str}_' + mtime + '.csv'

                # Wrap ids as ="..." so spreadsheet tools keep them as text.
                df['商品id'] = df['商品id'].apply(
                    lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
                )
                # Save the csv first: the mysql upload may rename df columns.
                self.save_to_csv(df, new_save_path, new_name, encoding='utf-8_sig')
                try:
                    if self.set_up_to_mogo:
                        d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)
                    if self.set_up_to_mysql:
                        m.df_to_mysql(df=df, db_name=db_name, tabel_name=collection_name)
                except Exception as e:
                    print(e)
                os.remove(file_path)
    finally:
        # The mongo connection has to be closed by hand, also when a file fails.
        if d is not None and d.client:
            d.client.close()
1191
+
1192
+ # @try_except
1193
def new_unzip(self, path=None, is_move=None):
    """
    Unpack every ``.zip`` found below ``path``.

    path: folder to scan (sub-folders included); defaults to ``self.path``.
    is_move: when truthy, single-file archives are deleted after their
        content has been extracted.

    Single-file archives are extracted next to the archive. Member names
    that are cp437 mojibake of UTF-8 text are repaired. If the target file
    already exists and is a JD '全部渠道_商品明细' report:
      1. the existing file is read and classified as sku or spu by product id,
      2. it is renamed to ``<sku|spu>_<date>_<date>_全部渠道_商品明细.xls``
         using the date(s) found in the archive name,
      3. the archive is then extracted in its place.
    If the existing file cannot be classified, nothing is extracted and the
    archive is kept. Archives with several members are passed to
    ``self.unzip_all``. Unreadable archives are reported and skipped.
    """
    if not path:
        path = self.path

    def _repair_name(raw_name):
        """Undo cp437 mojibake of a UTF-8 name; other names are returned unchanged."""
        try:
            return raw_name.encode('cp437').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return raw_name

    def _extract_member(archive, member, fixed_name, folder):
        """Extract ``member`` into ``folder`` and give it its repaired name."""
        archive.extract(member, folder)
        if fixed_name != member:
            os.replace(os.path.join(folder, member), os.path.join(folder, fixed_name))

    sku_ids = {'10035975359247', '10056642622343'}
    spu_ids = {'10021440233518', '10022867813485'}
    res_names = []  # full paths of archives that may be deleted at the end
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            if '~$' in name or 'DS_Store' in name:
                continue
            if not name.endswith('.zip'):
                continue
            old_file = os.path.join(root, name)
            try:
                archive = zipfile.ZipFile(old_file, 'r')
            except zipfile.BadZipFile as e:
                print(f'{name}, {e}')
                continue

            unpacked = False
            with archive:  # always released, also when extraction fails
                members = archive.namelist()
                single_member = len(members) == 1
                if single_member:
                    zip_name = members[0]
                    zip_name_1 = _repair_name(zip_name)
                    new_path = os.path.join(root, zip_name_1)
                    if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path:
                        # A JD report with the same name is already there.
                        try:
                            df = pd.read_excel(new_path)
                            # Compare as text: ids may be read as numbers.
                            present_ids = {str(value) for value in df['商品ID'].values}
                            pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细', name)
                            pattern2 = re.findall(
                                r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                name)
                            if pattern1:
                                year_date = '-'.join(pattern1[0]) + '_' + '-'.join(pattern1[0])
                            elif pattern2:
                                year_date = '-'.join(pattern2[0][0:3]) + '_' + '-'.join(pattern2[0][3:6])
                            else:
                                year_date = '无法提取日期'
                                print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                            if present_ids & sku_ids:
                                prefix = 'sku_'
                            elif present_ids & spu_ids:
                                prefix = 'spu_'
                            else:
                                prefix = None
                            if prefix:
                                os.rename(new_path,
                                          os.path.join(root, prefix + year_date + '_全部渠道_商品明细.xls'))
                                _extract_member(archive, zip_name, zip_name_1, root)
                                unpacked = True
                        except Exception as e:
                            print(e)
                    else:
                        _extract_member(archive, zip_name, zip_name_1, root)
                        unpacked = True

            if not single_member:
                self.unzip_all(path=old_file, save_path=path)
            elif unpacked and is_move:
                # Deleted after the loop, once the archive handle is closed.
                res_names.append(old_file)

    for zip_path in res_names:
        os.remove(zip_path)
        print(f'移除{zip_path}')
1270
+
1271
@staticmethod
def unzip_all(path, save_path):
    """
    Extract the archive ``path`` into ``save_path`` and tidy up the result.

    path: the zip archive to extract.
    save_path: folder the archive is extracted into.

    After extraction the whole of ``save_path`` is walked:
      1. files and folders whose names are cp437 mojibake of UTF-8 text are
         moved to their correctly spelled location,
      2. the emptied garbled folders are deleted,
      3. macOS ``__MACOSX`` folders are deleted.
    Names are repaired one path component at a time and only below
    ``save_path``, so non-ASCII characters in ``save_path`` itself do not
    prevent the repair.
    """
    def _repair(text):
        """Undo cp437 mojibake of UTF-8 text; other text is returned unchanged."""
        try:
            return text.encode('cp437').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return text

    with zipfile.ZipFile(path) as _f:
        _f.extractall(save_path)

    for _root, _dirs, _files in os.walk(save_path, topdown=False):
        _rel = os.path.relpath(_root, save_path)
        _parts = [] if _rel == os.curdir else _rel.split(os.sep)
        _fixed_parts = [_repair(_part) for _part in _parts]
        _root_garbled = _fixed_parts != _parts
        _new_root = os.path.join(save_path, *_fixed_parts) if _root_garbled else _root
        for _name in _files:
            if '~$' in _name or 'DS_Store' in _name:
                continue
            _new_name = _repair(_name)
            if not _root_garbled and _new_name == _name:
                continue  # nothing to repair
            if _root_garbled:
                os.makedirs(_new_root, exist_ok=True)
            os.replace(os.path.join(_root, _name), os.path.join(_new_root, _new_name))
        # Garbled folders have been emptied above; __MACOSX only holds macOS metadata.
        if _root_garbled or '__MACOSX' in _root:
            shutil.rmtree(_root)
1302
+
1303
+
1304
def main():
    """Unpack and convert the reports in the download folder, without any database upload."""
    cleaner = DataClean(
        path='/Users/xigua/Downloads',
        source_path='/Users/xigua/数据中心/原始文件2',
    )
    # Both uploads are switched off for this run.
    cleaner.set_up_to_mogo = False
    cleaner.set_up_to_mysql = False

    cleaner.new_unzip(is_move=True)  # unpack the downloaded archives
    cleaner.change_and_sort()
    # Archiving (move_all) and product-material handling (attribute) are
    # deliberately not run here.
1316
+
1317
+
1318
if __name__ == '__main__':
    # main()
    # Manual check that the mongodb settings of the 'aliyun' service can be read.
    username, password, host, port = get_myconf.select_config_values(target_service='aliyun', database='mongodb')
    # The password is masked so the secret never ends up in a terminal or log.
    print(username, '********' if password else password, host, port)