mdbq 3.3.5__py3-none-any.whl → 3.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/clean/data_clean.py DELETED
@@ -1,1551 +0,0 @@
1
- # -*- coding:utf-8 -*-
2
- import warnings
3
- import pandas as pd
4
- from functools import wraps
5
- import chardet
6
- import zipfile
7
- from pyzipper import PyZipFile
8
- import os
9
- import platform
10
- import pathlib
11
- import json
12
- from mdbq.mongo import mongo
13
- from mdbq.mysql import mysql
14
- from mdbq.config import get_myconf
15
- import datetime
16
- import time
17
- import re
18
- import shutil
19
- import getpass
20
-
21
- warnings.filterwarnings('ignore')
22
-
23
-
24
- class DataClean:
25
- """ 数据分类 """
26
-
27
def __init__(self, path, source_path):
    """Store the two directories this cleaner works with and enable both uploads.

    Args:
        path: Stored as ``self.path``; the directory that gets processed.
        source_path: Stored as ``self.source_path``; root under which
            converted reports are filed.
    """
    # Upload switches: both start enabled, set one to False to skip that database.
    self.set_up_to_mogo: bool = True  # MongoDB upload switch
    self.set_up_to_mysql: bool = True  # MySQL upload switch
    self.source_path = source_path
    self.path = path
32
-
33
def __call__(self, *args, **kwargs):
    """Run the whole clean-up pipeline against ``self.path``.

    Any positional or keyword arguments are accepted but ignored. The steps
    run in a fixed order and each one receives the current ``self.path``.
    """
    # (method name, extra keyword arguments) in execution order.
    pipeline = (
        ('new_unzip', {'is_move': True}),  # extract archives
        ('change_and_sort', {}),  # convert the reports
        ('move_all', {}),  # move files to the original-files folder
        ('attribute', {}),  # rename and classify product assets
    )
    for step_name, extra in pipeline:
        # self.path is re-read on every step, exactly as separate calls would do.
        getattr(self, step_name)(path=self.path, **extra)
39
-
40
- @staticmethod
41
- def try_except(func): # 在类内部定义一个异常处理方法
42
- @wraps(func)
43
- def wrapper(*args, **kwargs):
44
- try:
45
- return func(*args, **kwargs)
46
- except Exception as e:
47
- print(f'{func.__name__}, {e}') # 将异常信息返回
48
-
49
- return wrapper
50
-
51
@staticmethod
def get_encoding(file_path):
    """Detect the text encoding of a file with ``chardet``.

    The entire file is read into memory before detection, which is slow;
    avoid calling this unless it is really needed.

    Args:
        file_path: Path of the file to inspect; opened in binary mode.

    Returns:
        The ``'encoding'`` entry of ``chardet.detect``'s result, fetched with
        ``.get`` (so ``None`` if that entry is missing).
    """
    with open(file_path, 'rb') as stream:
        raw_bytes = stream.read()
    detection = chardet.detect(raw_bytes)
    return detection.get('encoding')
60
-
61
- @staticmethod
62
- def save_to_csv(_df, _save_paths, filenames, encoding='utf-8_sig'):
63
- if '.csv' not in filenames:
64
- filenames = f'{filenames}.csv'
65
- if not os.path.exists(_save_paths):
66
- os.makedirs(_save_paths, exist_ok=True)
67
- _df.to_csv(os.path.join(_save_paths, filenames), encoding=encoding, index=False, header=True)
68
-
69
- # @try_except
70
- def change_and_sort(self, path=None, is_except=[]):
71
- """数据转换"""
72
- if not path:
73
- path = self.path
74
-
75
- if self.set_up_to_mogo:
76
- username, password, host, port = get_myconf.select_config_values(target_service='home_lx',
77
- database='mongodb')
78
- d = mongo.UploadMongo(username=username, password=password, host=host, port=port,
79
- drop_duplicates=False
80
- )
81
- if self.set_up_to_mysql:
82
- username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mysql')
83
- m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)
84
-
85
- for root, dirs, files in os.walk(path, topdown=False):
86
- for name in files:
87
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
88
- continue
89
-
90
- is_continue = False
91
- if is_except:
92
- for item in is_except:
93
- if item in os.path.join(root, name):
94
- # print(name)
95
- is_continue = True
96
- break
97
- if is_continue: # 需要排除不做处理的文件或文件夹
98
- continue
99
-
100
- try:
101
- encoding = self.get_encoding(file_path=pathlib.Path(root, name))
102
- # ----------------- 推广报表 分割线 -----------------
103
- tg_names = [
104
- '账户报表', # 旧版,后来改成 营销场景报表了,C 店还是旧版
105
- '营销场景报表',
106
- '计划报表',
107
- '单元报表',
108
- '关键词报表',
109
- '人群报表',
110
- '主体报表',
111
- '其他主体报表',
112
- '创意报表',
113
- '地域报表',
114
- '权益报表',
115
- ]
116
- for tg_name in tg_names:
117
- if tg_name in name and '汇总' not in name and name.endswith('.csv'): # 人群报表排除达摩盘报表: 人群报表汇总
118
- pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
119
- if not pattern: # 说明已经转换过
120
- continue
121
- shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
122
- if shop_name:
123
- shop_name = shop_name[0]
124
- else:
125
- shop_name = ''
126
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
127
- if '地域' not in name: # 除了地域报表, 检查数据的字段是否包含“场景名字”,如果没有,说明没有选“pbix” 数据模块下载
128
- ck = df.columns.tolist()
129
- if '场景名字' not in ck:
130
- print(f'{name} 报表字段缺失, 请选择Pbix数据模板下载')
131
- continue
132
- if len(df) == 0:
133
- print(f'{name} 报表是空的, 请重新下载, 此报表已移除')
134
- os.remove(os.path.join(root, name))
135
- continue
136
-
137
- df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
138
- df.fillna(0, inplace=True)
139
- col_ids = [
140
- # '场景ID', # 2024.10.5 改为不加 =""
141
- '计划ID',
142
- '单元ID',
143
- '主体ID',
144
- '宝贝ID',
145
- '词ID/词包ID',
146
- '创意ID',
147
- ]
148
- sb = df.columns.tolist()
149
- if '日期' not in sb:
150
- print(f'{name} 注意:该报表不包含分日数据,数据不会保存,请重新下载!')
151
- continue
152
- if '省' in sb:
153
- if '市' not in sb:
154
- print(
155
- f'{name} 注意:请下载市级地域报表,而不是省报表,数据不会保存,请重新下载!')
156
- continue
157
- for col_id in col_ids:
158
- if col_id in sb:
159
- df[col_id] = df[col_id].apply(
160
- lambda x: f'="{x}"' if x and '=' not in str(x) else x
161
- )
162
- date_min = f'_{df["日期"].values.min()}_'
163
- date_max = f'{df["日期"].values.max()}.csv'
164
- if '万里马' in name:
165
- tm_s_name = pattern[0] + shop_name + date_min + date_max
166
- if shop_name == '广东万里马':
167
- new_root_p = pathlib.Path(self.source_path, '推广报表_淘宝店', tg_name) # 文件夹,未包括文件名
168
- else:
169
- new_root_p = pathlib.Path(self.source_path, '推广报表', tg_name) # 文件夹,未包括文件名
170
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
171
- if '省' in df.columns.tolist() and '场景名字' in df.columns.tolist() and '完整' in name:
172
- if shop_name == '广东万里马':
173
- new_root_p = pathlib.Path(self.source_path, '推广报表_淘宝店', f'完整_{tg_name}')
174
- else:
175
- new_root_p = pathlib.Path(self.source_path, '推广报表', f'完整_{tg_name}')
176
- tm_s_name = f'完整_{tm_s_name}'
177
- self.save_to_csv(df, new_root_p, tm_s_name)
178
- # if self.set_up_to_mogo:
179
- # d.df_to_mongo(df=df, db_name='天猫数据1', collection_name=f'天猫_推广_{tg_name}')
180
- # if self.set_up_to_mysql:
181
- # m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name=f'天猫_推广_{tg_name}')
182
- os.remove(os.path.join(root, name))
183
- else:
184
- print(f'{name} 文件名不含"万里马", 不属于爬虫下载,您可以手动进行分类,但不会上传数据库')
185
-
186
- if name.endswith('.csv') and '超级直播' in name:
187
- # 超级直播
188
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
189
- if len(df) == 0:
190
- print(f'{name} 报表数据为空')
191
- os.remove(os.path.join(root, name))
192
- continue
193
- pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
194
- shop_name = re.findall(r'\d{8}_\d{6}_(.*)\W', name)
195
- if shop_name:
196
- shop_name = shop_name[0]
197
- else:
198
- shop_name = ''
199
- cols = [
200
- # '场景ID', # 2024.10.5 改为不加 =""
201
- '计划ID',
202
- ]
203
- for col in cols:
204
- df[col] = df[col].apply(lambda x: f'="{x}"' if x and '=' not in str(x) else x)
205
- df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
206
- root_new = pathlib.Path(self.source_path, '推广报表', '超级直播')
207
- date_min = f'_{df["日期"].values.min()}_' # 仅适用于日期列未转换之前, 还是整数,转换后不能用这个函数
208
- date_max = f'{df["日期"].values.max()}.csv'
209
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
210
- new_name = pattern[0] + shop_name + date_min + date_max
211
- self.save_to_csv(df, root_new, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
212
- if self.set_up_to_mogo:
213
- d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_超级直播')
214
- if self.set_up_to_mysql:
215
- m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_超级直播')
216
- os.remove(os.path.join(root, name))
217
- elif name.endswith('.xls') and '短直联投' in name:
218
- # 短直联投
219
- df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
220
- df = pd.concat(df)
221
- if len(df) == 0:
222
- print(f'{name} 报表数据为空')
223
- os.remove(os.path.join(root, name))
224
- continue
225
- new_name2 = os.path.splitext(name)[0] + '.csv'
226
- df['订单Id'] = df['订单Id'].apply(
227
- lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
228
- )
229
- root_new = pathlib.Path(self.source_path, '推广报表/短直联投')
230
- self.save_to_csv(df, root_new, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
231
- if self.set_up_to_mogo:
232
- d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_短直联投')
233
- if self.set_up_to_mysql:
234
- m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_短直联投')
235
- os.remove(os.path.join(root, name))
236
- elif name.endswith('.xls') and '视频加速推广' in name:
237
- # 超级短视频
238
- df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
239
- df = pd.concat(df)
240
- if len(df) == 0:
241
- print(f'{name} 报表数据为空')
242
- os.remove(os.path.join(root, name))
243
- continue
244
- new_name2 = os.path.splitext(name)[0] + '.csv'
245
- df['计划ID'] = df['计划ID'].apply(
246
- lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
247
- )
248
- df['视频id'] = df['视频id'].apply(
249
- lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
250
- )
251
- root_new = pathlib.Path(self.source_path, '推广报表/超级短视频')
252
- self.save_to_csv(df, root_new, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
253
- if self.set_up_to_mogo:
254
- d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_超级短视频')
255
- if self.set_up_to_mysql:
256
- m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_超级短视频')
257
- os.remove(os.path.join(root, name))
258
- if '人群报表汇总' in name:
259
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
260
- if len(df) == 0:
261
- print(f'{name} 报表数据为空')
262
- os.remove(os.path.join(root, name))
263
- continue
264
- min_clm = df.min()['日期']
265
- max_clm = df.max()['日期']
266
- new_name = '{}{}{}'.format(min_clm, '_', max_clm)
267
- df['点击率'] = df['点击率'].apply(lambda x: format(x, '.2%') if x > 0 else '') # 格式化成百分比
268
- df['UV点击率'] = df['UV点击率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
269
- df['收藏加购率'] = df['收藏加购率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
270
- df['UV收藏加购率'] = df['UV收藏加购率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
271
- df['点击转化率'] = df['点击转化率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
272
- df['UV点击转化率'] = df['UV点击转化率'].apply(lambda x: format(x, '.2%') if x > 0 else '')
273
- df.replace(to_replace=[0], value='', regex=False, inplace=True)
274
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
275
- df.to_csv(os.path.join(self.path, 'DMP报表_' + new_name + '.csv'), encoding='utf-8_sig',
276
- index=False, header=True)
277
- if self.set_up_to_mogo:
278
- d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_达摩盘_DMP报表',)
279
- if self.set_up_to_mysql:
280
- m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_达摩盘_DMP报表')
281
- os.remove(os.path.join(root, name))
282
- # ----------------- 推广报表 分割线 -----------------
283
- # ----------------- 推广报表 分割线 -----------------
284
-
285
- date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
286
- date02 = re.findall(r'\d{4}-\d{2}-\d{2}_(\d{4}-\d{2}-\d{2})', str(name))
287
- if name.endswith('.xls') and '生意参谋' in name and '无线店铺流量来源' in name:
288
- # 无线店铺流量来源
289
- new_name = os.path.splitext(name)[0] + '.csv'
290
- df = pd.read_excel(os.path.join(root, name), header=5)
291
- if len(df) == 0:
292
- print(f'{name} 报表数据为空')
293
- os.remove(os.path.join(root, name))
294
- continue
295
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
296
- if date01[0] != date02[0]:
297
- data_lis = date01[0] + '_' + date02[0]
298
- df.insert(loc=0, column='数据周期', value=data_lis)
299
- df.insert(loc=0, column='日期', value=date01[0])
300
- # 2024-2-19 官方更新了推广渠道来源名称
301
- # df['三级来源'] = df['三级来源'].apply(
302
- # lambda x: '精准人群推广' if x == '精准人群推广(原引力魔方)'
303
- # else '关键词推广' if x == '关键词推广(原直通车)'
304
- # else '智能场景' if x == '智能场景(原万相台)'
305
- # else x
306
- # )
307
- df['三级来源'] = df['三级来源'].apply(
308
- lambda x: re.sub('(.*)', '', str(x) if x else x)
309
- )
310
- # df = df[df['访客数'] != '0']
311
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
312
- for col in df.columns.tolist():
313
- df[col] = df[col].apply(lambda x: 0 if not x else 0 if x == '' else x)
314
- if '经营优势' in df['一级来源'].tolist(): # 新版流量
315
- new_name = re.sub(r'\s?\(.*\)', '', new_name) # 删除小括号
316
- new_name = os.path.splitext(new_name)[0] + '_新版.csv'
317
-
318
- self.save_to_csv(df, root, new_name) # 因为 mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
319
- if '经营优势' in df['一级来源'].tolist(): # 新版流量
320
- if '数据周期' in df.columns.tolist():
321
- if self.set_up_to_mogo:
322
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_店铺来源_月数据')
323
- if self.set_up_to_mysql:
324
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_店铺来源_月数据')
325
- else:
326
- if self.set_up_to_mogo:
327
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_店铺来源_日数据')
328
- if self.set_up_to_mysql:
329
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_店铺来源_日数据')
330
- else: # 旧版流量
331
- if '数据周期' in df.columns.tolist():
332
- if self.set_up_to_mogo:
333
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_店铺来源_月数据_旧版')
334
- if self.set_up_to_mysql:
335
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_店铺来源_月数据_旧版')
336
- else:
337
- if self.set_up_to_mogo:
338
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_店铺来源_日数据_旧版')
339
- if self.set_up_to_mysql:
340
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_店铺来源_日数据_旧版')
341
- os.remove(os.path.join(root, name))
342
-
343
- elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
344
- # 店铺来源,手淘搜索,关键词
345
- pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
346
- df = pd.read_excel(os.path.join(root, name), header=5)
347
- if len(df) == 0:
348
- print(f'{name} 报表数据为空')
349
- continue
350
- df.replace(to_replace=[','], value='', regex=True, inplace=True)
351
- df.insert(loc=0, column='日期', value=pattern[0][1])
352
- df.rename(columns={
353
- '来源名称': '关键词',
354
- '收藏商品-支付买家数': '收藏商品_支付买家数',
355
- '加购商品-支付买家数': '加购商品_支付买家数',
356
- }, inplace=True)
357
- if pattern[0][0] != pattern[0][1]:
358
- data_lis = pattern[0][0] + '_' + pattern[0][1]
359
- df.insert(loc=1, column='数据周期', value=data_lis)
360
- new_name = os.path.splitext(name)[0] + '.csv'
361
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
362
- os.remove(os.path.join(root, name))
363
-
364
- elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
365
- # 店铺商品排行
366
- new_name = os.path.splitext(name)[0] + '.csv'
367
- df = pd.read_excel(os.path.join(root, name), header=4)
368
- if len(df) == 0:
369
- print(f'{name} 报表数据为空')
370
- os.remove(os.path.join(root, name))
371
- continue
372
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
373
- df['商品ID'] = df['商品ID'].apply(
374
- lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
375
- )
376
- df['货号'] = df['货号'].apply(
377
- lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
378
- )
379
- df.rename(columns={'统计日期': '日期', '商品ID': '商品id'}, inplace=True)
380
- if date01[0] != date02[0]:
381
- data_lis = date01[0] + '_' + date02[0]
382
- df.insert(loc=1, column='数据周期', value=data_lis)
383
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
384
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
385
- if self.set_up_to_mogo:
386
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_商品排行')
387
- if self.set_up_to_mysql:
388
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_商品排行')
389
- os.remove(os.path.join(root, name))
390
-
391
- elif name.endswith('.xls') and '参谋店铺整体日报' in name:
392
- # 自助取数,店铺日报
393
- new_name = os.path.splitext(name)[0] + '.csv'
394
- df = pd.read_excel(os.path.join(root, name), header=7)
395
- if len(df) == 0:
396
- print(f'{name} 报表数据为空')
397
- os.remove(os.path.join(root, name))
398
- continue
399
- df.rename(columns={'统计日期': '日期'}, inplace=True)
400
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
401
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
402
- if self.set_up_to_mogo:
403
- d.df_to_mongo(df=df,db_name='生意参谋2', collection_name='生意参谋_自助取数_整体日报')
404
- if self.set_up_to_mysql:
405
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_自助取数_整体日报')
406
- os.remove(os.path.join(root, name))
407
-
408
- elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
409
- # 自助取数,每日流量
410
- new_name = os.path.splitext(name)[0] + '.csv'
411
- df = pd.read_excel(os.path.join(root, name), header=7)
412
- if len(df) == 0:
413
- print(f'{name} 报表数据为空')
414
- os.remove(os.path.join(root, name))
415
- continue
416
- df.rename(columns={'统计日期': '日期'}, inplace=True)
417
- # 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
418
- df['三级来源'] = df['三级来源'].apply(
419
- lambda x: '精准人群推广' if x == '引力魔方'
420
- else '关键词推广' if x == '直通车'
421
- else '智能场景' if x == '万相台'
422
- else '精准人群推广' if x == '精准人群推广(原引力魔方)'
423
- else '关键词推广' if x == '关键词推广(原直通车)'
424
- else '智能场景' if x == '智能场景(原万相台)'
425
- else x
426
- )
427
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
428
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
429
- if self.set_up_to_mogo:
430
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_自助取数_每日流量')
431
- if self.set_up_to_mysql:
432
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_自助取数_每日流量')
433
- os.remove(os.path.join(root, name))
434
-
435
- elif name.endswith('.xls') and '商品sku' in name:
436
- # 自助取数,商品sku
437
- new_name = os.path.splitext(name)[0] + '.csv'
438
- df = pd.read_excel(os.path.join(root, name), header=7)
439
- if len(df) == 0:
440
- print(f'{name} 报表数据为空')
441
- os.remove(os.path.join(root, name))
442
- continue
443
- df.rename(columns={
444
- '统计日期': '日期',
445
- '商品ID': '商品id',
446
- 'SKU ID': 'sku id',
447
- '商品SKU': '商品sku',
448
- }, inplace=True)
449
- for _i in ['商品id', 'sku id']:
450
- df[_i] = df[_i].astype(str).apply(lambda x: f'="{x}"')
451
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
452
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
453
- if self.set_up_to_mogo:
454
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_自助取数_商品sku')
455
- if self.set_up_to_mysql:
456
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_自助取数_商品sku')
457
- os.remove(os.path.join(root, name))
458
-
459
- elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
460
- # 自助取数,月店铺流量来源
461
- new_name = os.path.splitext(name)[0] + '.csv'
462
- df = pd.read_excel(os.path.join(root, name), header=7)
463
- if len(df) == 0:
464
- print(f'{name} 报表数据为空')
465
- os.remove(os.path.join(root, name))
466
- continue
467
- df.rename(columns={'统计日期': '数据周期'}, inplace=True)
468
- # 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
469
- df['三级来源'] = df['三级来源'].apply(
470
- lambda x: '精准人群推广' if x == '引力魔方'
471
- else '关键词推广' if x == '直通车'
472
- else '智能场景' if x == '万相台'
473
- else '精准人群推广' if x == '精准人群推广(原引力魔方)'
474
- else '关键词推广' if x == '关键词推广(原直通车)'
475
- else '智能场景' if x == '智能场景(原万相台)'
476
- else x
477
- )
478
- df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
479
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
480
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
481
- if self.set_up_to_mogo:
482
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_自助取数_店铺流量_月数据')
483
- if self.set_up_to_mysql:
484
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_自助取数_店铺流量_月数据')
485
- os.remove(os.path.join(root, name))
486
- elif name.endswith('.xlsx') and '直播分场次效果' in name:
487
- pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
488
- if pattern:
489
- continue
490
- df = pd.read_excel(os.path.join(root, name), header=0)
491
- if len(df) == 0:
492
- print(f'{name} 报表数据为空')
493
- continue
494
- df.replace(to_replace=['--'], value='0', regex=False, inplace=True)
495
- df.replace(to_replace=[','], value='', regex=True, inplace=True)
496
- df['直播开播时间'] = pd.to_datetime(df['直播开播时间'], format='%Y-%m-%d %H:%M:%S', errors='ignore')
497
- df.insert(loc=0, column='日期', value=df['直播开播时间'])
498
- df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
499
- df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
500
- min_clm = str(df.min()['直播开播时间']).split(' ')[0]
501
- max_clm = str(df.max()['直播开播时间']).split(' ')[0]
502
- new_name = f'{os.path.splitext(name)[0]}_{min_clm}_{max_clm}.csv'
503
- new_name = re.sub(r' ?(\(\d+\))', '',new_name)
504
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
505
- os.remove(os.path.join(root, name))
506
- elif name.endswith('.csv') and '分天数据-计划_活动类型-推广概览-数据汇总' in name:
507
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
508
- df['日期'].replace(to_replace=['\\t'], value='', regex=True, inplace=True)
509
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
510
- min_clm = str(df['日期'].min()).split(' ')[0]
511
- max_clm = str(df['日期'].max()).split(' ')[0]
512
- new_name = f'淘宝联盟_分天数据_计划_活动类型_推广概览_数据汇总_{min_clm}_{max_clm}'
513
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
514
- os.remove(os.path.join(root, name))
515
- elif name.endswith('.csv') and 'baobei' in name:
516
- # 生意经宝贝指标日数据
517
- # print(name)
518
- date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
519
- if not date: # 阻止月数据及已转换的表格
520
- print(f'{name} 不支持或是已转换的表格')
521
- os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
522
- continue
523
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
524
- if len(df) == 0:
525
- print(f'{name} 报表数据为空')
526
- os.remove(os.path.join(root, name))
527
- continue
528
- if '日期' in df.columns.tolist():
529
- df.pop('日期')
530
- new_date = '-'.join(date[0])
531
- df.insert(loc=0, column='日期', value=new_date)
532
- df.replace(to_replace=['--'], value='', regex=False, inplace=True)
533
- df['宝贝ID'] = df['宝贝ID'].apply(
534
- lambda x: f'="{x}"' if x and '=' not in str(x) else x
535
- )
536
- df['商家编码'] = df['商家编码'].apply(
537
- lambda x: f'="{x}"' if x and '=' not in str(x) else x
538
- )
539
- name_st = re.findall(r'(.*)\d{4}\d{2}\d{2}\.', str(name)) # baobeitrans-
540
- new_name = f'{name_st[0]}{new_date}.csv'
541
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
542
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
543
- if self.set_up_to_mogo:
544
- d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_宝贝指标')
545
- if self.set_up_to_mysql:
546
- m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_宝贝指标')
547
- os.remove(os.path.join(root, name))
548
-
549
- elif name.endswith('.csv') and '店铺销售指标' in name:
550
- # 生意经, 店铺指标,仅限月数据,实际日指标也可以
551
- name_st = re.findall(r'(.*)\(分日', name)
552
- if not name_st:
553
- print(f'{name} 已转换的表格')
554
- continue
555
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
556
- if len(df) == 0:
557
- print(f'{name} 报表数据为空')
558
- os.remove(os.path.join(root, name))
559
- continue
560
- df['日期'] = df['日期'].astype(str).apply(
561
- lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
562
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # 转换日期列
563
- # min_clm = str(df.min()['日期']).split(' ')[0]
564
- # max_clm = str(df.max()['日期']).split(' ')[0]
565
- min_clm = str(df['日期'].min()).split(' ')[0]
566
- max_clm = str(df['日期'].max()).split(' ')[0]
567
- new_name = f'{name_st[0]}-{min_clm}_{max_clm}.csv' # 保存时将(分日)去掉
568
- df.replace(to_replace=['--'], value='', regex=False, inplace=True)
569
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
570
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
571
- if self.set_up_to_mogo:
572
- d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_店铺指标')
573
- if self.set_up_to_mysql:
574
- m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_店铺指标')
575
- os.remove(os.path.join(root, name))
576
-
577
- elif name.endswith('csv') and '省份' in name:
578
- # 生意经,地域分布, 仅限日数据
579
- pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
580
- if not pattern or '省份城市分析2' not in name:
581
- print(f'{name} 不支持或已转换的表格')
582
- os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
583
- continue
584
- date = pattern[0][1:]
585
- date = '-'.join(date)
586
- new_name = f'{pattern[0][0]}-{date}.csv'
587
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
588
- if len(df) == 0:
589
- print(f'{name} 报表数据为空')
590
- os.remove(os.path.join(root, name))
591
- continue
592
- df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
593
- df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
594
- df['省'].fillna(method='ffill', inplace=True)
595
- df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
596
- pov = df.pop('省')
597
- city = df.pop('城市')
598
- df['省+市'] = df['省份']
599
- df['省份'] = pov
600
- df.insert(loc=1, column='城市', value=city)
601
- df.insert(loc=0, column='日期', value=date)
602
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
603
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
604
- if self.set_up_to_mogo:
605
- d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_地域分布_省份城市分析')
606
- if self.set_up_to_mysql:
607
- m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_地域分布_省份城市分析')
608
- os.remove(os.path.join(root, name)) # 移除已转换的原文件
609
-
610
- elif name.endswith('csv') and 'order' in name:
611
- # 生意经,订单数据,仅限月数据
612
- pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
613
- if not pattern:
614
- print(f'{name} 不支持或已转换的表格')
615
- os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
616
- continue
617
- date1 = pattern[0][1:4]
618
- date1 = '-'.join(date1)
619
- date2 = pattern[0][4:]
620
- date2 = '-'.join(date2)
621
- date = f'{date1}_{date2}'
622
- new_name = f'{pattern[0][0]}{date}.csv'
623
- df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
624
- if len(df) == 0:
625
- print(f'{name} 报表数据为空')
626
- os.remove(os.path.join(root, name))
627
- continue
628
- df.insert(loc=0, column='日期', value=date1)
629
- df.insert(loc=1, column='数据周期', value=date)
630
- df['商品id'] = df['宝贝链接'].apply(
631
- lambda x: f'=\"{"".join(re.findall("id=(.*)", str(x))[0])}\"' if x else x)
632
- df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
633
- df['颜色编码'] = df['商家编码'].apply(
634
- lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
635
- df['商家编码'] = df['商家编码'].apply(lambda x: f'="{x}"' if x else x)
636
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
637
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
638
- if self.set_up_to_mogo:
639
- d.df_to_mongo(df=df, db_name='生意经1', collection_name='生意经_订单数据')
640
- if self.set_up_to_mysql:
641
- m.df_to_mysql(df=df, db_name='生意经1', tabel_name='生意经_订单数据')
642
- os.remove(os.path.join(root, name)) # 移除已转换的原文件
643
-
644
- elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
645
- # 直播间成交订单明细
646
- df = pd.read_excel(os.path.join(root, name), header=0)
647
- if len(df) == 0:
648
- print(f'{name} 报表数据为空')
649
- os.remove(os.path.join(root, name))
650
- continue
651
- df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
652
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
653
- cols = ['开播时间', '下单时间', '支付时间', '确认收货时间']
654
- for col in cols:
655
- df[col] = pd.to_datetime(df[col]) # 转换日期列
656
- for col2 in ['支付金额', '确认收货金额']:
657
- df[col2] = pd.to_numeric(df[col2], errors='ignore')
658
- df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
659
- date_min = df['日期'].values.min() + '_'
660
- date_max = df['日期'].values.max()
661
- new_name = '直播间成交订单明细_' + date_min + date_max + '.csv'
662
- for col3 in ['场次id', '商品id', '父订单', '子订单']:
663
- df[col3] = df[col3].apply(
664
- lambda x: "{0}{1}{2}".format('="', x, '"') if x and '=' not in str(x) else x
665
- )
666
- col4 = ['日期', '直播标题', '开播时间', '场次id', '支付时间', '支付金额', '商品id', '商品标题',
667
- '商品一级类目', '父订单', '子订单', '下单时间', '确认收货时间', '确认收货金额']
668
- df_lin = df[col4]
669
- # 调整列顺序
670
- df = pd.merge(df_lin, df, how='outer', on=col4)
671
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
672
- if self.set_up_to_mogo:
673
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_直播间成交订单明细')
674
- if self.set_up_to_mysql:
675
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_直播间成交订单明细')
676
- os.remove(os.path.join(root, name))
677
-
678
- elif name.endswith('.xlsx') and '直播间大盘数据' in name:
679
- # 直播间大盘数据
680
- df = pd.read_excel(os.path.join(root, name), header=0)
681
- if len(df) == 0:
682
- print(f'{name} 报表数据为空')
683
- os.remove(os.path.join(root, name))
684
- continue
685
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
686
- df.rename(columns={'统计日期': '日期'}, inplace=True)
687
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
688
- df['日期'] = df['日期'].apply(lambda x: x.strftime('%Y-%m-%d'))
689
- date_min = df['日期'].values.min() + '_'
690
- date_max = df['日期'].values.max()
691
- new_name = '直播间大盘数据_' + date_min + date_max + '.csv'
692
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
693
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
694
- if self.set_up_to_mogo:
695
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_直播间大盘数据')
696
- if self.set_up_to_mysql:
697
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_直播间大盘数据')
698
- os.remove(os.path.join(root, name))
699
-
700
- elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
701
- # 直播业绩-成交拆解
702
- df = pd.read_excel(os.path.join(root, name), header=5)
703
- if len(df) == 0:
704
- print(f'{name} 报表数据为空')
705
- os.remove(os.path.join(root, name))
706
- continue
707
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
708
- df.replace(to_replace=[','], value='', regex=True, inplace=True)
709
- df.rename(columns={'统计日期': '日期'}, inplace=True)
710
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
711
- df['日期'] = df['日期'].apply(lambda x: x.strftime('%Y-%m-%d'))
712
- date_min = df['日期'].values.min() + '_'
713
- date_max = df['日期'].values.max()
714
- new_name = '直播业绩_成交拆解_' + date_min + date_max + '.csv'
715
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
716
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
717
- if self.set_up_to_mogo:
718
- d.df_to_mongo(df=df, db_name='生意参谋2', collection_name='生意参谋_直播业绩')
719
- if self.set_up_to_mysql:
720
- m.df_to_mysql(df=df, db_name='生意参谋2', tabel_name='生意参谋_直播业绩')
721
- os.remove(os.path.join(root, name))
722
-
723
- elif name.endswith('.xlsx') and '明星店铺' in name:
724
- # 品销宝
725
- pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})_', name)
726
- if pattern:
727
- continue
728
- sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群'] # 品销宝
729
- file_name4 = os.path.splitext(name)[0] # 明星店铺报表
730
- for sheet4 in sheets4:
731
- df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
732
- # print(sheet4)
733
- if len(df) == 0:
734
- print(f'{name} 报表数据为空')
735
- os.remove(os.path.join(root, name))
736
- continue
737
- if len(df) < 1:
738
- print(f'{name} 跳过')
739
- continue
740
- else:
741
- df.insert(loc=1, column='报表类型', value=sheet4)
742
- df.fillna(0, inplace=True)
743
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # 转换日期列
744
- min_clm = str(df['日期'].min()).split(' ')[0]
745
- max_clm = str(df['日期'].max()).split(' ')[0]
746
- new_file_name4 = f'{sheet4}_{file_name4}_{min_clm}_{max_clm}.csv'
747
- # 以sheet名进一步创建子文件夹
748
- root_new = str(pathlib.Path(self.source_path, '推广报表/品销宝', sheet4))
749
- self.save_to_csv(df, root_new, new_file_name4) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
750
- if self.set_up_to_mogo:
751
- d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='天猫_推广_品销宝')
752
- if self.set_up_to_mysql:
753
- m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='天猫_推广_品销宝')
754
- os.remove(os.path.join(root, name))
755
-
756
- elif name.endswith('.csv') and '淘宝店铺数据' in name:
757
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
758
- if self.set_up_to_mogo:
759
- d.df_to_mongo(df=df, db_name='市场数据1', collection_name='淘宝店铺数据')
760
- if self.set_up_to_mysql:
761
- m.df_to_mysql(df=df, db_name='市场数据1', tabel_name='淘宝店铺数据')
762
-
763
- elif name.endswith('.csv') and '人群洞察' in name:
764
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
765
- df.replace(to_replace=['--'], value='', regex=False, inplace=True)
766
- df = df[df['人群规模'] != '']
767
- if len(df) == 0:
768
- os.remove(os.path.join(root, name))
769
- print(f'{name}: 数据为空, 已移除: {os.path.join(root, name)}')
770
- continue
771
- if self.set_up_to_mogo:
772
- d.df_to_mongo(df=df, db_name='天猫数据1', collection_name='万相台_人群洞察')
773
- if self.set_up_to_mysql:
774
- m.df_to_mysql(df=df, db_name='天猫数据1', tabel_name='万相台_人群洞察')
775
-
776
- # ----------------------- 京东数据处理分界线 -----------------------
777
- elif name.endswith('.csv') and '关键词点击成交报表_pbix同步_勿删改' in name:
778
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
779
- for col in df.columns.tolist():
780
- if '(' in col:
781
- new_col = re.sub('[()]', '_', col)
782
- new_col = new_col.strip('_')
783
- df.rename(columns={col: new_col}, inplace=True)
784
- df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
785
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
786
- min_clm = str(df['日期'].min()).split(' ')[0]
787
- max_clm = str(df['日期'].max()).split(' ')[0]
788
- new_name = f'京东推广关键词点击成交报表_{min_clm}_{max_clm}.csv'
789
- self.save_to_csv(df, root, new_name)
790
- os.remove(os.path.join(root, name))
791
- elif name.endswith('.csv') and '营销概况_全站营销' in name:
792
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
793
- df = df[(df['日期'] != '日期') & (df['日期'] != '汇总') & (df['日期'] != '0') & (df['花费'] != '0') & (df['花费'] != '0.00')]
794
- df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
795
- df.drop("'当前时间'", axis=1, inplace=True)
796
- df.rename(columns={'全站ROI': '全站roi'}, inplace=True)
797
- df.insert(loc=1, column='产品线', value='全站营销')
798
- new_name = re.sub('至', '_', name)
799
- self.save_to_csv(df, root, new_name)
800
- os.remove(os.path.join(root, name))
801
- elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
802
- # 京东店铺来源
803
- if '按天' not in name:
804
- print(f'{name} 京东流量请按天下载')
805
- continue
806
- new_name = name.split(r'__20')[0]
807
- date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
808
- new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
809
- new_date02 = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
810
- new_date03 = f'{new_date01}_{new_date02}'
811
- df = pd.read_excel(os.path.join(root, name), header=0)
812
- if len(df) == 0:
813
- print(f'{name} 报表数据为空')
814
- os.remove(os.path.join(root, name))
815
- continue
816
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
817
- df.insert(loc=0, column='日期', value=new_date01)
818
- if new_date01 != new_date02:
819
- df.insert(loc=1, column='数据周期', value=new_date03)
820
- cols = df.columns.tolist()
821
- if '三级来源' in cols:
822
- source = '三级来源'
823
- elif '二级来源' in cols:
824
- source = '二级来源'
825
- else:
826
- source = '一级来源'
827
-
828
- new_name = f'{new_name}_{source}_{new_date03}.csv'
829
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
830
- self.save_to_csv(df, root, new_name) # csv 文件仍然保留这些列
831
- for col_2024 in cols: # 京东这个表有字段加了去年日期,删除这些同比数据字段,不然列数量爆炸
832
- if '20' in col_2024 and '流量来源' in name:
833
- df.drop(col_2024, axis=1, inplace=True)
834
- if self.set_up_to_mogo:
835
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_流量来源_日数据')
836
- if self.set_up_to_mysql:
837
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_流量来源_日数据')
838
- os.remove(os.path.join(root, name))
839
-
840
- elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
841
- # 京东商品明细 文件转换
842
- date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
843
- if not date1[0]:
844
- print(f'{name}: 仅支持日数据')
845
- continue
846
- if date1:
847
- date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
848
- df = pd.read_excel(os.path.join(root, name), header=0)
849
- if len(df) == 0:
850
- print(f'{name} 报表数据为空')
851
- os.remove(os.path.join(root, name))
852
- continue
853
- if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
854
- new_name = f'sku_{date1}_全部渠道_商品明细.csv'
855
- elif '10021440233518' in df['商品ID'].values or '10022867813485' in df['商品ID'].values:
856
- new_name = f'spu_{date1}_全部渠道_商品明细.csv'
857
- else:
858
- new_name = f'未分类_{date1}_全部渠道_商品明细.csv'
859
- df.replace(to_replace=['-'], value='', regex=False, inplace=True)
860
- df.rename(columns={'商品ID': '商品id'}, inplace=True)
861
- df['商品id'] = df['商品id'].apply(lambda x: f'="{x}"' if x else x)
862
- df['货号'] = df['货号'].apply(lambda x: f'="{x}"' if x else x)
863
- df.insert(loc=0, column='日期', value=date1)
864
-
865
- self.save_to_csv(df, root, new_name)
866
- if self.set_up_to_mogo:
867
- if 'sku' in new_name:
868
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_sku_商品明细')
869
- elif 'spu' in new_name:
870
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_spu_商品明细')
871
- if self.set_up_to_mysql:
872
- if 'sku' in new_name:
873
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_sku_商品明细')
874
- elif 'spu' in new_name:
875
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_spu_商品明细')
876
- os.remove(os.path.join(root, name))
877
- elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
878
- # 京东商品词下排名
879
- pattern = re.findall(r'(\d{4}-\d{2}-\d{2})-(\d{4}-\d{2}-\d{2})', name)
880
- if not pattern:
881
- os.remove(os.path.join(root, name))
882
- continue
883
- if pattern[0][0] == pattern[0][1]:
884
- print(f'{name}: 检测到数据周期异常,仅支持7天数据')
885
- os.remove(os.path.join(root, name))
886
- continue
887
- new_name = os.path.splitext(name)[0] + '.csv'
888
- # print(name)
889
- df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
890
- if len(df) == 0:
891
- print(f'{name} 报表数据为空')
892
- os.remove(os.path.join(root, name))
893
- continue
894
- if len(df.columns.tolist()) < 20:
895
- print(f'{name}: 报表可能缺失诊断数据')
896
- os.remove(os.path.join(root, name))
897
- continue
898
- df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
899
- df['skuid'] = df['skuid'].apply(lambda x: f'="{x}"' if x and '=' not in str(x) else x)
900
- self.save_to_csv(df, root, new_name)
901
- if self.set_up_to_mogo:
902
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_商品词下排名')
903
- if self.set_up_to_mysql:
904
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_商品词下排名')
905
- os.remove(os.path.join(root, name)) # 移除已转换的原文件
906
-
907
- elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
908
- # 京东商品排名
909
- new_name = os.path.splitext(name)[0] + '.csv'
910
- date_in = re.findall(r'(\d{4}-\d{2}-\d{2})-搜索', str(name))[0]
911
- df = pd.read_excel(os.path.join(root, name), header=0)
912
- if len(df) == 0:
913
- print(f'{name} 报表数据为空')
914
- os.remove(os.path.join(root, name))
915
- continue
916
- df.insert(0, '日期', date_in) # 插入新列
917
- df.rename(columns={'SKU': 'skuid'}, inplace=True)
918
- df['skuid'] = df['skuid'].apply(lambda x: f'="{x}"' if x and '=' not in str(x) else x)
919
- self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
920
- if self.set_up_to_mogo:
921
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_商品排名')
922
- if self.set_up_to_mysql:
923
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_商品排名')
924
- os.remove(os.path.join(root, name)) # 移除已转换的原文件
925
-
926
- elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
927
- # 京东,竞争-竞店概况-竞店详情-全部渠道
928
- date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
929
- start_date = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
930
- end_date = f'{date01[0][3]}-{date01[0][4]}-{date01[0][5]}'
931
- df = pd.read_excel(os.path.join(root, name), header=0)
932
- if len(df) == 0:
933
- print(f'{name} 报表数据为空')
934
- os.remove(os.path.join(root, name))
935
- continue
936
- df.replace(to_replace=[','], value='', regex=True, inplace=True)
937
- df.insert(loc=0, column='日期', value=start_date)
938
- new_name = f'{os.path.splitext(name)[0]}'
939
- new_name = re.sub(r'\d{8}_\d{8}', f'{start_date}_{end_date}', new_name)
940
- self.save_to_csv(df, root, new_name)
941
- if self.set_up_to_mogo:
942
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_竞店监控_日数据')
943
- if self.set_up_to_mysql:
944
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_竞店监控_日数据')
945
- os.remove(os.path.join(root, name))
946
-
947
- elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
948
- # 京东 自助报表 店铺日报
949
- df = pd.read_excel(os.path.join(root, name), header=0)
950
- if len(df) == 0:
951
- print(f'{name} 报表数据为空')
952
- os.remove(os.path.join(root, name))
953
- continue
954
- if '访客数-全部渠道' not in df.columns.tolist(): # 识别是否真的京东日报
955
- continue
956
- df['日期'] = df['日期'].apply(
957
- lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
958
- )
959
- date_min = df['日期'].values.min()
960
- date_max = df['日期'].values.max()
961
- # df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
962
- new_name = f'JD店铺日报_' + re.findall(r"(.*)\d{8}_\d{8}", name)[0] + f'_{date_min}_{date_max}.csv'
963
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
964
- if self.set_up_to_mogo:
965
- d.df_to_mongo(df=df, db_name='京东数据1', collection_name='京东_自助取数_店铺日报')
966
- if self.set_up_to_mysql:
967
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_自助取数_店铺日报')
968
- os.remove(os.path.join(root, name))
969
-
970
- elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
971
- # 京东 行业 商家榜单
972
- date2 = re.findall(r'_\d{8}-\d+', name)
973
- if date2:
974
- print(f'{name}: 请下载日数据,不支持其他周期')
975
- os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
976
- continue
977
- date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})', name)
978
- date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
979
- df = pd.read_excel(os.path.join(root, name), header=0)
980
- if len(df) == 0:
981
- print(f'{name} 报表数据为空')
982
- os.remove(os.path.join(root, name))
983
- continue
984
- df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
985
- df.insert(loc=0, column='类型', value='商家榜单')
986
- new_name = f'{os.path.splitext(name)[0]}_{date1}.csv'
987
- self.save_to_csv(df, root, new_name)
988
- if self.set_up_to_mogo:
989
- d.df_to_mongo(df=df,db_name='京东数据1', collection_name='京东_商家榜单')
990
- if self.set_up_to_mysql:
991
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_商家榜单')
992
- os.remove(os.path.join(root, name))
993
-
994
- elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
995
- # 京东 sku 导出
996
- df = pd.read_excel(os.path.join(root, name), header=0)
997
- if len(df) == 0:
998
- print(f'{name} 报表数据为空')
999
- os.remove(os.path.join(root, name))
1000
- continue
1001
- d_time = datetime.datetime.today().strftime('%Y-%m-%d')
1002
- df.insert(loc=0, column='日期', value=d_time)
1003
- for col in ['SKUID', '商品编码', '商家SKU', '货号']:
1004
- df[col] = df[col].apply(lambda x: f'="{x}"' if x else x)
1005
- df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
1006
- new_name = f'京东商品信息_{os.path.splitext(name)[0]}_{d_time}.csv'
1007
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
1008
- if self.set_up_to_mogo:
1009
- d.df_to_mongo(df=df, db_name='属性设置1', collection_name='京东商品信息')
1010
- if self.set_up_to_mysql:
1011
- m.df_to_mysql(df=df, db_name='属性设置1', tabel_name='京东商品信息')
1012
- os.remove(os.path.join(root, name))
1013
-
1014
- elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
1015
- # 京东 spu 导出
1016
- df = pd.read_excel(os.path.join(root, name), header=0)
1017
- if len(df) == 0:
1018
- print(f'{name} 报表数据为空')
1019
- os.remove(os.path.join(root, name))
1020
- continue
1021
- d_time = datetime.datetime.today().strftime('%Y-%m-%d')
1022
- df.insert(loc=0, column='日期', value=d_time)
1023
- for col in ['商品编码', '货号']:
1024
- df[col] = df[col].apply(lambda x: f'="{x}"' if x else x)
1025
- new_name = f'京东商品信息_{os.path.splitext(name)[0]}_{d_time}.csv'
1026
-
1027
- self.save_to_csv(df, root, new_name)
1028
- os.remove(os.path.join(root, name))
1029
-
1030
- elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
1031
- # 京东推广数据
1032
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
1033
- if len(df) == 0:
1034
- print(f'{name} 报表数据为空')
1035
- os.remove(os.path.join(root, name))
1036
- continue
1037
- pic_list = df['日期'].tolist()
1038
- pic = []
1039
- for i in pic_list:
1040
- pics = re.findall(pattern=r'(\d{4})(\d{2})(\d{2})', string=str(i))
1041
- if pics:
1042
- pics = '-'.join(pics[0])
1043
- pic.append(pics)
1044
- else:
1045
- pic.append(i)
1046
- df['日期'] = pd.Series(pic)
1047
- date_min = df['日期'].values.min() + '_'
1048
- date_max = df['日期'].values.max()
1049
- new_name2 = '京东点击成交报表_' + date_min + date_max + '.csv'
1050
- for col in ['计划ID', '触发SKU ID', '跟单SKU ID', 'SPU ID']:
1051
- df[col] = df[col].astype(str).apply(lambda x: f'="{x}"' if x and '=' not in x else x)
1052
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
1053
- self.save_to_csv(df, root, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
1054
- if self.set_up_to_mogo:
1055
- d.df_to_mongo(df=df, db_name='京东数据1', collection_name='京东_推广_京准通')
1056
- if self.set_up_to_mysql:
1057
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_推广_京准通')
1058
- os.remove(os.path.join(root, name))
1059
- elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
1060
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
1061
- if len(df) == 0:
1062
- print(f'{name} 报表数据为空')
1063
- os.remove(os.path.join(root, name))
1064
- continue
1065
- pic_list = df['日期'].tolist()
1066
- pic = []
1067
- for i in pic_list:
1068
- pics = re.findall(pattern=r'(\d{4})(\d{2})(\d{2})', string=str(i))
1069
- if pics:
1070
- pics = '-'.join(pics[0])
1071
- pic.append(pics)
1072
- else:
1073
- pic.append(i)
1074
- df['日期'] = pd.Series(pic)
1075
- date_min = df['日期'].values.min() + '_'
1076
- date_max = df['日期'].values.max()
1077
- new_name2 = '京东推广搜索词_' + date_min + date_max + '.csv'
1078
- df.replace(to_replace=[0], value='', regex=False, inplace=True)
1079
- df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
1080
- df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
1081
- self.save_to_csv(df, root, new_name2) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
1082
- if self.set_up_to_mogo:
1083
- d.df_to_mongo(df=df, db_name='京东数据1', collection_name='京东_推广_搜索词报表')
1084
- if self.set_up_to_mysql:
1085
- m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_推广_搜索词报表')
1086
- os.remove(os.path.join(root, name))
1087
-
1088
- elif name.endswith('.xlsx') and '零售明细统计' in name:
1089
- #
1090
- df = pd.read_excel(os.path.join(root, name), header=0)
1091
- if len(df) == 0:
1092
- print(f'{name} 报表数据为空')
1093
- os.remove(os.path.join(root, name))
1094
- continue
1095
- df['摘要'] = df['摘要'].apply(lambda x: re.sub('\'', '', str(x)) if x else x)
1096
- for col in ['原单号', '商品代码', '摘要']:
1097
- df[col] = df[col].apply(lambda x: f'="{re.sub(".0", "", str(x))}"' if x else x)
1098
- df = df[df['缩略图'] != '合计']
1099
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
1100
- date_min = f'_{re.sub("T.*", "", str(df["日期"].values.min()))}_'
1101
- date_max = f'{re.sub("T.*", "", str(df["日期"].values.max()))}.csv'
1102
- new_name = re.findall(r'(.*)_\d{4}-\d{2}-\d{2}', name)[0]
1103
- new_name = f'{new_name}{date_min}{date_max}'
1104
- self.save_to_csv(df, root, new_name) # mysql 可能改变 df 列名,所以在上传 mysql 前保存 csv
1105
- if self.set_up_to_mogo:
1106
- d.df_to_mongo(df=df, db_name='生意经1', collection_name='E3_零售明细统计')
1107
- if self.set_up_to_mysql:
1108
- m.df_to_mysql(df=df, db_name='生意经1', tabel_name='E3_零售明细统计')
1109
- os.remove(os.path.join(root, name))
1110
- except Exception as e:
1111
- now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
1112
- print(f'{now}{name}: 报错: {e}')
1113
- if self.set_up_to_mogo:
1114
- if d.client:
1115
- d.client.close() # 必须手动关闭数据库连接
1116
-
1117
- """
1118
- {文件分类}
1119
- 将已处理完的文件 分类移到原始文件夹下
1120
- 此处t_path参数定义了子文件夹的生成名称
1121
- """
1122
-
1123
@staticmethod
def move_files(path, _name, target_path, _as_month=None):
    """
    Move the file ``_name`` from ``path`` into ``target_path``.

    path: directory that currently holds the file.
    _name: file name (no directory part) of the file to move.
    target_path: destination directory; created when missing.
    _as_month: when truthy and ``_name`` contains a ``YYYY-MM-DD`` date, the
        file goes into a ``YYYY-MM`` sub-directory of ``target_path``.

    A same-named file already present at the destination is replaced.
    Raises FileNotFoundError when the source does not exist; this is checked
    before anything at the destination is touched.
    """
    source_file = os.path.join(path, _name)
    if not os.path.exists(source_file):
        # Fail before deleting the destination copy, otherwise a missing
        # source would destroy the archived file and then raise anyway.
        raise FileNotFoundError(f'No such file or directory: {source_file!r}')

    # Work with plain strings: shutil.move() only accepts path-like
    # destinations reliably from Python 3.9 onwards.
    dest_dir = str(target_path)
    if _as_month:
        months = re.findall(r'(\d{4}-\d{2})-\d{2}', str(_name))
        if months:
            dest_dir = os.path.join(dest_dir, months[0])  # year-month sub-folder
    os.makedirs(dest_dir, exist_ok=True)

    dest_file = os.path.join(dest_dir, _name)
    if os.path.exists(dest_file) and os.path.samefile(source_file, dest_file):
        return  # already in place; removing it would delete the only copy
    if os.path.isfile(dest_file):
        os.remove(dest_file)  # replace the stale copy at the destination
    shutil.move(source_file, dest_dir)
1143
-
1144
- # @try_except
1145
def move_all(self, path=None, is_except=None):
    """
    File processed ``.csv`` reports into category folders under ``self.source_path``.

    The directory tree below ``path`` is walked and every ``.csv`` file name
    is checked against an ordered rule table. The first rule whose keywords
    all occur in the name decides the destination sub-folder and whether a
    ``YYYY-MM`` sub-folder is requested; the move itself is delegated to
    ``self.move_files``. Files matching no rule are left where they are.

    path: directory to scan; defaults to ``self.path``.
    is_except: iterable of substrings; a file is skipped when any of them
        occurs in its full path.
    """
    if not path:
        path = self.path
    skip_markers = list(is_except) if is_except else []

    day_before_ext = r'(\d{4})-(\d{2})-(\d{2})\.'

    def shop_traffic(file_name):
        """Pick the folder for shop traffic-source reports from the day span in the name."""
        days = re.findall(r'\d{4}-\d{2}-(\d{2})_\d{4}-\d{2}-(\d{2})', file_name)
        if not days:
            # Without a date range the report cannot be classified; skip it
            # instead of failing the whole run with an IndexError.
            print(f'{file_name}: 无法从文件名中提取日期, 已跳过')
            return None
        first_day, last_day = days[0]
        if int(last_day) - int(first_day) > 15:
            return '月数据/流量来源_旧版', None
        if '_新版' in file_name:
            return '生意参谋/流量来源', True
        return '生意参谋/流量来源_旧版', True

    def dated(pattern, sub_dir, as_month):
        """Build a handler that only accepts names containing ``pattern``."""
        def resolve(file_name):
            if re.findall(pattern, file_name):
                return sub_dir, as_month
            return None  # monthly or not-yet-converted tables are left alone
        return resolve

    def jd_goods_detail(file_name):
        """Split JD product-detail reports into sku / spu / unclassified folders."""
        if 'sku' in file_name:
            return '京东报表/JD商品明细sku', True
        if 'spu' in file_name:
            return '京东报表/JD商品明细spu', True
        return '京东报表/未找到分类数据', True

    def jd_bulk_export(file_name):
        """Split JD bulk product exports into sku / spu folders."""
        if 'SKU' in file_name:
            return '京东报表/商品信息导出/sku', False
        if 'SPU' in file_name:
            return '京东报表/商品信息导出/spu', False
        return None

    # Ordered rules: (keywords that must all be in the name, handler).
    # A handler is either a fixed (sub-folder, month-split flag) pair or a
    # callable returning such a pair, or None to skip the file.
    # Order matters: the first matching rule wins.
    rules = [
        (('无线店铺流量来源',), shop_traffic),
        (('生意参谋', '无线店铺三级流量来源详情'), ('生意参谋/手淘搜索来源', True)),
        (('商品_全部',), ('生意参谋/商品排行', True)),
        (('参谋店铺整体日报',), ('生意参谋/全店数据-自助取数', True)),
        (('参谋每日流量_自助取数',), ('生意参谋/流量来源-自助取数', True)),
        (('商品sku',), ('生意参谋/商品sku-自助取数', True)),
        (('参谋店铺流量来源(月)',), ('月数据/流量来源-自助取数-月数据', True)),
        (('淘宝联盟_分天数据_计划_活动类型_推广概览_数据汇总',), ('月数据/淘宝联盟', False)),
        (('竞店分析', '来源分析-入店来源'), ('市场数据/竞店分析/来源分析/入店来源', False)),
        (('竞店分析', '来源分析-入店搜索词'), ('市场数据/竞店分析/来源分析/入店搜索词', False)),
        (('竞店分析', '销售分析-关键指标对比'), ('市场数据/竞店分析/销售分析/关键指标对比', False)),
        (('竞店分析', '销售分析-top商品榜'), ('市场数据/竞店分析/销售分析/top商品榜', False)),
        (('监控店铺数据',), ('市场数据/监控店铺数据', True)),
        (('监控商品',), ('市场数据/监控商品数据', True)),
        (('类目洞察', '属性分析_分析明细_汇总'), ('市场数据/类目洞察/属性分析/汇总', True)),
        (('类目洞察', '属性分析_分析明细_商品发现'), ('市场数据/类目洞察/属性分析/商品发现', True)),
        (('类目洞察', '价格分析_分析明细_汇总'), ('市场数据/类目洞察/价格分析/汇总', True)),
        (('类目洞察', '价格分析_分析明细_商品发现'), ('市场数据/类目洞察/价格分析/商品发现', True)),
        (('搜索排行_搜索',), ('市场数据/搜索排行', True)),
        (('市场排行_店铺排行',), ('市场数据/市场二级类目店铺', True)),
        (('baobei',), dated(r's-(\d{4})-(\d{2})-(\d{2})\.', '生意经/宝贝指标', True)),
        (('省份城市分析',), dated(day_before_ext, '生意经/地域分布', True)),
        (('店铺销售指标',), dated(day_before_ext, '生意经/店铺指标', False)),
        (('order',), dated(day_before_ext, '生意经/订单数据', False)),
        (('直播间成交订单明细',), ('生意参谋/直播订单明细', True)),
        (('直播间大盘数据',), ('生意参谋/直播间大盘数据', True)),
        (('直播业绩_成交拆解',), ('生意参谋/直播业绩_成交拆解', True)),
        (('DMP报表',), ('推广报表/DMP报表', True)),
        (('人群洞察',), ('推广报表/人群洞察', True)),
        (('客户_客户概况_画像',), ('生意参谋/客户_客户概况_画像', True)),
        (('市场排行_店铺',), ('市场数据/市场排行', True)),
        (('淘宝店铺数据',), ('市场数据/其他数据', False)),
        (('零售明细统计',), ('生意经/E3零售明细统计', True)),
        (('客户运营平台_客户列表',), ('生意参谋/客户运营平台', True)),
        (('直播分场次效果',),
         dated(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', '生意参谋/直播场次分析', True)),
        # ---- JD (京东) reports ----
        (('全部渠道_商品明细',), jd_goods_detail),
        (('竞店概况_竞店详情',), ('京东报表/JD竞店监控数据', True)),
        (('京东推广搜索词',), ('京东报表/JD推广搜索词报表', True)),
        (('京东点击成交报表',), ('京东报表/JD推广报表', True)),
        (('搜索分析-排名定位-商品词下排名',), ('京东报表/JD排名定位/商品词下排名', True)),
        (('搜索分析-排名定位-商品排名',), ('京东报表/JD排名定位/商品排名', True)),
        (('按天_店铺来源_流量来源',), ('京东报表/JD流量来源', True)),
        (('JD店铺日报',), ('京东报表/JD店铺日报', True)),
        (('商家榜单_女包_整体',), ('京东报表/JD商家榜单', True)),
        (('导出-批量任务',), jd_bulk_export),
        (('_行业分析_竞争分析',), ('京东报表/行业竞争分析', True)),
        (('付费广告_行业分析_行业大盘',), ('京东报表/行业大盘_流量排行', False)),
        (('营销概况_全站营销',), ('京东报表/JD推广_全站营销报表', True)),
        (('京东推广关键词点击成交报表',), ('京东报表/JD推广_关键词报表', True)),
        (('爱库存_商品榜单_spu_',), ('爱库存/商品榜单', True)),
    ]

    def normalized(directory):
        return os.path.normcase(os.path.abspath(directory))

    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            if not name.endswith('.csv'):
                continue
            full_path = os.path.join(root, name)
            if any(marker in full_path for marker in skip_markers):
                continue  # explicitly excluded file or folder

            target = None
            for keywords, handler in rules:
                if all(word in name for word in keywords):
                    target = handler(name) if callable(handler) else handler
                    break
            if target is None:
                continue

            sub_dir, as_month = target
            target_dir = str(pathlib.Path(self.source_path, sub_dir))

            # Work out the folder move_files will end up using (same
            # year-month rule) so a file already sitting there is not
            # deleted and "moved" onto itself.
            final_dir = target_dir
            if as_month:
                months = re.findall(r'(\d{4}-\d{2})-\d{2}', name)
                if months:
                    final_dir = os.path.join(target_dir, months[0])
            if normalized(root) == normalized(final_dir):
                continue

            # Use the directory the file was actually found in (root), not
            # the walk's starting point, so nested files can be moved too.
            self.move_files(path=root, _name=name, target_path=target_dir, _as_month=as_month)
1355
-
1356
def attribute(self, path=None, _str='商品素材导出', ):
    """
    Convert Tmall product-material exports (.xlsx) into dated .csv files.

    Walks ``path`` (default ``self.path``) and considers every ``.xlsx`` file
    whose name contains no ``~``, no ``DS_Store`` and no Chinese characters.
    A workbook that has both the '商品白底图' and '方版场景图' columns gets:
      * a '日期' column holding the file's last-modified date,
      * a '店铺名称' column chosen from known product ids,
      * its '商品id' values wrapped as ``="..."``,
    and is saved through ``self.save_to_csv`` under
    ``<source_path>/属性设置/商品素材`` with the shop prefix, ``_str`` and the
    date in the file name. The source workbook is deleted after the CSV has
    been written.

    NOTE: uploading the result to MongoDB / MySQL is intentionally not done here.
    """
    if not path:
        path = self.path
    new_save_path = os.path.join(self.source_path, '属性设置', '商品素材')

    # Product ids that identify which shop an export belongs to.
    flagship_ids = {652737455554, 683449516249, 37114359548, 570735930393}
    enterprise_ids = {704624764420, 701781021639, 520380314717}

    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            # '~' also covers Office lock files, whose names start with '~$'
            if not name.endswith('.xlsx') or '~' in name or 'DS_Store' in name:
                continue
            if re.search('[\u4e00-\u9fa5]', name):
                continue  # names containing Chinese characters are not handled here

            file_path = os.path.join(root, name)
            df = pd.read_excel(file_path, header=0, engine='openpyxl')
            columns = df.columns.tolist()
            if '商品白底图' not in columns or '方版场景图' not in columns:
                continue  # not a product-material export

            # Last-modified date of the file (st_mtime), formatted as YYYY-MM-DD
            mtime = time.strftime('%Y-%m-%d', time.localtime(os.stat(file_path).st_mtime))
            df['日期'] = mtime
            df.rename(columns={'商品ID': '商品id'}, inplace=True)
            # mtime always matches the format, so no error handling is needed
            df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d')

            item_ids = set(df['商品id'].tolist())  # built once for all lookups
            if item_ids & flagship_ids:
                shop_name, prefix = '万里马官方旗舰店', 'tm_'
            elif item_ids & enterprise_ids:
                shop_name, prefix = '万里马官方企业店', 'tb_'
            else:
                shop_name, prefix = 'coome旗舰店', 'coome_'
            df.insert(0, '店铺名称', shop_name)
            new_name = prefix + os.path.splitext(name)[0] + f'_{_str}_' + mtime + '.csv'

            # Wrap ids as ="..." so spreadsheet tools keep them as text
            df['商品id'] = df['商品id'].apply(
                lambda x: f'="{x}"' if x and '=' not in str(x) else x
            )
            self.save_to_csv(df, new_save_path, new_name, encoding='utf-8_sig')
            os.remove(file_path)  # drop the workbook once its CSV exists
1421
-
1422
- # @try_except
1423
def new_unzip(self, path=None, is_move=None):
    """
    Extract every ``.zip`` archive found under *path*.

    Archives holding exactly one member are extracted next to the archive.
    Archives holding any other number of members are delegated to
    ``self.unzip_all`` (extracted into *path*).

    Special handling when the single member's target file already exists and
    its path contains '全部渠道_商品明细' (JD product-detail export):
        1. The existing file is read with pandas.
        2. A date (or date range) is taken from the archive's file name.
        3. If the existing file contains one of the known SKU / SPU product
           ids, it is renamed to ``sku_<dates>_...xls`` / ``spu_<dates>_...xls``
           and the archive member is then extracted in its place.
        4. The freshly extracted file still has to be renamed afterwards
           (the original notes mention ``_jd_rename`` for that step).

    :param path: directory to scan; falls back to ``self.path`` when falsy.
    :param is_move: when truthy, processed single-member archives are
        deleted after all archives have been handled.
    """
    if not path:
        path = self.path

    def _repair_name(raw):
        # zipfile decodes member names lacking the UTF-8 flag as cp437, so a
        # UTF-8 name stored without the flag arrives garbled.  Round-trip it
        # back; names that are already correct fail one of the two steps and
        # are returned untouched.
        try:
            return raw.encode('cp437').decode('utf-8')
        except UnicodeError:
            return raw

    def _extract_member(archive, member, fixed, dest):
        # Extract under the stored name (the only key the archive knows),
        # then move the result to the repaired name when the two differ.
        extracted = archive.extract(member, dest)
        if fixed != member:
            target = os.path.join(dest, fixed)
            os.makedirs(os.path.dirname(target), exist_ok=True)
            os.replace(extracted, target)

    # Full paths of archives to delete once every handle is closed; deleting
    # an archive while it is still open fails on some platforms.
    pending_removal = []
    for root, _dirs, files in os.walk(path, topdown=False):
        for name in files:
            if '~$' in name or 'DS_Store' in name:
                continue
            if not name.endswith('.zip'):
                continue
            old_file = os.path.join(root, name)
            with zipfile.ZipFile(old_file, 'r') as f:
                members = f.namelist()
                if len(members) == 1:  # archive with a single member
                    member = members[0]
                    fixed_name = _repair_name(member)
                    new_path = os.path.join(root, fixed_name)  # where the member will land
                    if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path:
                        # JD export: a file with the same name already exists
                        df = pd.read_excel(new_path)
                        try:
                            pattern1 = re.findall(
                                r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细', name)
                            pattern2 = re.findall(
                                r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                name)
                            if pattern1:
                                day = '-'.join(pattern1[0])
                                year_date = day + '_' + day
                            elif pattern2:
                                span = list(pattern2[0])
                                year_date = '-'.join(span[0:3]) + '_' + '-'.join(span[3:7])
                            else:
                                year_date = '无法提取日期'
                                print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                            ids = df['商品ID'].values
                            if '10035975359247' in ids or '10056642622343' in ids:
                                prefix = 'sku_'
                            elif '10021440233518' in ids or '10022867813485' in ids:
                                prefix = 'spu_'
                            else:
                                prefix = None
                            if prefix is not None:
                                os.rename(
                                    new_path,
                                    os.path.join(root, prefix + year_date + '_全部渠道_商品明细.xls'))
                                _extract_member(f, member, fixed_name, root)
                            # NOTE(review): as in the original, the archive is
                            # scheduled for removal even when neither id set
                            # matched and nothing was extracted - confirm.
                            if is_move:
                                pending_removal.append(old_file)
                        except Exception as e:
                            # Best effort: report and move on to the next archive.
                            print(e)
                    else:
                        _extract_member(f, member, fixed_name, root)
                        if is_move:
                            pending_removal.append(old_file)
            if len(members) != 1:  # archive with several (or zero) members
                self.unzip_all(path=old_file, save_path=path)

    if is_move:
        for zip_path in pending_removal:
            os.remove(zip_path)
            print(f'移除{zip_path}')
1500
-
1501
@staticmethod
def unzip_all(path, save_path):
    """
    Extract the archive at *path* into *save_path*, then tidy the result.

    After extraction the whole *save_path* tree is walked bottom-up and:
        1. files whose name is garbled (UTF-8 bytes that were decoded as
           cp437) are renamed to the repaired name;
        2. files inside a garbled directory are moved to the repaired
           directory, and the garbled directory is removed;
        3. directories whose path contains '__MACOSX' (macOS archive
           residue) are removed.

    :param path: path of the zip archive to extract.
    :param save_path: directory to extract into; the clean-up pass covers
        everything below it, not only the files that were just extracted.
    """
    def _repair(text):
        # Undo a cp437 mis-decoding of UTF-8 bytes; text that is not garbled
        # fails one of the two steps and is returned unchanged.
        try:
            return text.encode('cp437').decode('utf-8')
        except UnicodeError:
            return text

    with PyZipFile(path) as _f:
        _f.extractall(save_path)

    for _root, _dirs, _files in os.walk(save_path, topdown=False):
        # NOTE(review): the root is repaired as one whole path string, so a
        # save_path containing characters outside cp437 disables directory
        # repair for everything below it.
        _new_root = _repair(_root)
        for _name in _files:
            if '~$' in _name or 'DS_Store' in _name:
                continue
            # Directory and file name are repaired independently, so a failure
            # on one no longer blocks the other.  In particular a correctly
            # named file is moved out of a garbled directory before that
            # directory is deleted below.
            _old = os.path.join(_root, _name)
            _new = os.path.join(_new_root, _repair(_name))
            if _new_root != _root:  # garbled directory: create the repaired one
                os.makedirs(_new_root, exist_ok=True)
            os.rename(_old, _new)
        if _new_root != _root or '__MACOSX' in _root:
            shutil.rmtree(_root)
1532
-
1533
-
1534
def main():
    """Run the clean-up pipeline on the local download folder.

    Uploads to MongoDB and MySQL are switched off; archives are unpacked (and
    removed), files are converted/sorted and finally moved to the raw-data
    folder.  Renaming/classifying product material (``attribute``) is left out.
    """
    cleaner = DataClean(
        path='/Users/xigua/Downloads',
        source_path='/Users/xigua/数据中心/原始文件2',
    )
    # Local run only: do not push anything to the databases.
    cleaner.set_up_to_mogo = False
    cleaner.set_up_to_mysql = False

    cleaner.new_unzip(is_move=True)  # unpack archives, then delete them
    cleaner.change_and_sort()
    cleaner.move_all()  # move files into the raw-data folder
1546
-
1547
-
1548
if __name__ == '__main__':
    main()
    # Look up the MongoDB connection settings for the 'aliyun' service.
    username, _password, host, port = get_myconf.select_config_values(target_service='aliyun', database='mongodb')
    # The password is a secret: never echo it to stdout (terminals, CI logs
    # and shell history captures would retain it).  Print a mask instead.
    print(username, '******', host, port)