mdbq 3.3.5__py3-none-any.whl → 3.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1350 +0,0 @@
1
- # -*- coding:utf-8 -*-
2
- import warnings
3
- import pandas as pd
4
- from functools import wraps
5
- import chardet
6
- import zipfile
7
- import socket
8
- from pyzipper import PyZipFile
9
- import os
10
- import platform
11
- import json
12
- from mdbq.mongo import mongo
13
- from mdbq.mysql import mysql
14
- from mdbq.config import myconfig
15
- from mdbq.aggregation import df_types
16
- from mdbq.config import products
17
- from mdbq.aggregation import optimize_data
18
- from mdbq.aggregation import query_data
19
- import datetime
20
- import time
21
- import re
22
- import shutil
23
- import getpass
24
-
25
- warnings.filterwarnings('ignore')
26
-
27
-
28
- if platform.system() == 'Windows':
29
- # windows版本
30
- Data_Path = r'C:\同步空间\BaiduSyncdisk'
31
- D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
32
- Share_Path = os.path.join(r'\\192.168.1.198\时尚事业部\01.运营部\天猫报表') # 共享文件根目录
33
- elif platform.system() == 'Linux':
34
- Data_Path = '数据中心'
35
- D_PATH = 'Downloads'
36
- if not os.path.exists(D_PATH):
37
- os.makedirs(D_PATH)
38
- Share_Path = '' # linux 通常是远程服务器,不需要访问共享
39
- else:
40
- Data_Path = f'/Users/{getpass.getuser()}/数据中心' # 使用Mac独立网络时
41
- D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')
42
- Share_Path = os.path.join('/Volumes/时尚事业部/01.运营部/天猫报表') # 共享文件根目录
43
-
44
- upload_path = os.path.join(D_PATH, '数据上传中心') # 此目录位于下载文件夹
45
- # source_path = os.path.join(Data_Path, '原始文件2') # 此目录保存下载并清洗过的文件,作为数据库备份
46
- source_path3 = os.path.join(Data_Path, '原始文件3') # 此目录保存下载并清洗过的文件,作为数据库备份
47
-
48
- username, password, host, port, service_database = None, None, None, None, None,
49
- if socket.gethostname() in ['xigua_lx', 'xigua1', 'MacBookPro']:
50
- conf = myconfig.main()
51
- conf_data = conf['Windows']['xigua_lx']['mysql']['local']
52
- username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']
53
- service_database = {'xigua_lx': 'mysql'}
54
- elif socket.gethostname() in ['company', 'Mac2.local']:
55
- conf = myconfig.main()
56
- conf_data = conf['Windows']['company']['mysql']['local']
57
- username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']
58
- service_database = {'company': 'mysql'}
59
- if not username:
60
- print(f'找不到主机:')
61
-
62
-
63
-
64
- class DataClean:
65
- """ 数据分类 """
66
-
67
- def __init__(self, path, source_path):
68
- self.path = path # 数据源位置,下载文件夹
69
- self.source_path = source_path # 原始文件保存目录
70
- self.datas = []
71
-
72
- @staticmethod
73
- def try_except(func): # 在类内部定义一个异常处理方法
74
- @wraps(func)
75
- def wrapper(*args, **kwargs):
76
- try:
77
- return func(*args, **kwargs)
78
- except Exception as e:
79
- print(f'{func.__name__}, {e}') # 将异常信息返回
80
-
81
- return wrapper
82
-
83
- @staticmethod
84
- def get_encoding(file_path):
85
- """
86
- 获取文件的编码方式, 读取速度比较慢,非必要不要使用
87
- """
88
- with open(file_path, 'rb') as f:
89
- f1 = f.read()
90
- encod = chardet.detect(f1).get('encoding')
91
- return encod
92
-
93
- @staticmethod
94
- def save_to_csv(_df, _save_paths, filenames, encoding='utf-8_sig'):
95
- if '.csv' not in filenames:
96
- filenames = f'{filenames}.csv'
97
- if not os.path.exists(_save_paths):
98
- os.makedirs(_save_paths, exist_ok=True)
99
- _df.to_csv(os.path.join(_save_paths, filenames), encoding=encoding, index=False, header=True)
100
-
101
- def sycm_tm(self, path=None, is_except=[]):
102
- """ 天猫 生意参谋数据 """
103
- if not path:
104
- path = self.path
105
- report_names = [
106
- {
107
- '文件简称': '商品排行_', # 文件名中包含的字符
108
- '数据库名': '生意参谋3',
109
- '集合名称': '商品排行',
110
- },
111
- {
112
- '文件简称': '店铺来源_来源构成_', # 文件名中包含的字符
113
- '数据库名': '生意参谋3',
114
- '集合名称': '店铺流量来源构成',
115
- },
116
- {
117
- '文件简称': '爱库存_商品榜单_', # 文件名中包含的字符
118
- '数据库名': '爱库存2',
119
- '集合名称': '商品spu榜单',
120
- },
121
- {
122
- '文件简称': '手淘搜索_本店引流词_', # 文件名中包含的字符
123
- '数据库名': '生意参谋3',
124
- '集合名称': '手淘搜索_本店引流词',
125
- },
126
- {
127
- '文件简称': '直播分场次效果_', # 文件名中包含的字符
128
- '数据库名': '生意参谋3',
129
- '集合名称': '直播分场次效果',
130
- },
131
- {
132
- '文件简称': 'crm_客户列表_', # 文件名中包含的字符
133
- '数据库名': '生意参谋3',
134
- '集合名称': 'crm成交客户',
135
- },
136
- ]
137
- for root, dirs, files in os.walk(path, topdown=False):
138
- for name in files:
139
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
140
- continue
141
- is_continue = False
142
- if is_except:
143
- for item in is_except:
144
- if item in os.path.join(root, name):
145
- # print(name)
146
- is_continue = True
147
- break
148
- if is_continue: # 需要排除不做处理的文件或文件夹
149
- continue
150
-
151
- # 这里排除掉非目标报表
152
- is_continue = False
153
- db_name = None # 初始化参数
154
- collection_name = None
155
- for item in report_names:
156
- if item['文件简称'] in name:
157
- db_name = item['数据库名']
158
- collection_name = item['集合名称']
159
- is_continue = True
160
- if not is_continue:
161
- continue
162
- if name.endswith('.csv') and '商品排行_' in name:
163
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
164
- # df = pd.read_excel(os.path.join(root, name), header=4)
165
-
166
- elif name.endswith('.csv') and '手淘搜索_本店引流词_' in name:
167
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
168
- # df = pd.read_excel(os.path.join(root, name), header=5, engine='xlrd')
169
-
170
- elif name.endswith('.csv') and '_来源构成_' in name:
171
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
172
-
173
- elif name.endswith('.csv') and '爱库存_商品榜单_' in name:
174
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
175
- if '店铺名称' not in df.columns.tolist():
176
- df.insert(loc=1, column='店铺名称', value='爱库存平台') # df中插入新列
177
- new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
178
- self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
179
- os.remove(os.path.join(root, name))
180
- elif name.endswith('.csv') and '直播分场次效果' in name:
181
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
182
- # shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)_', name)[0]
183
- # if '店铺名称' not in df.columns.tolist():
184
- # df.insert(loc=1, column='店铺名称', value=shop_name)
185
- # new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
186
- # self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
187
- # os.remove(os.path.join(root, name))
188
- elif name.endswith('.csv') and 'crm_客户列表' in name:
189
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
190
-
191
- # 将数据传入 self.datas 等待更新进数据库
192
- if not db_name or not collection_name:
193
- # print(f'db_name/collection_name 不能为空')
194
- continue
195
- self.datas.append(
196
- {
197
- '数据库名': db_name,
198
- '集合名称': collection_name,
199
- '数据主体': df,
200
- '文件名': name,
201
- }
202
- )
203
-
204
- def dmp_tm(self, path=None, is_except=[]):
205
- """ 天猫 达摩盘 """
206
- if not path:
207
- path = self.path
208
- report_names = [
209
- {
210
- '文件简称': '我的人群属性', # 文件名中包含的字符
211
- '数据库名': '达摩盘3',
212
- '集合名称': '我的人群属性',
213
- },
214
- {
215
- '文件简称': 'dmp人群报表_', # 文件名中包含的字符
216
- '数据库名': '达摩盘3',
217
- '集合名称': 'dmp人群报表',
218
- },
219
- {
220
- '文件简称': '货品洞察_全店单品', # 文件名中包含的字符
221
- '数据库名': '达摩盘3',
222
- '集合名称': '货品洞察_全店单品',
223
- },
224
- ]
225
- for root, dirs, files in os.walk(path, topdown=False):
226
- for name in files:
227
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
228
- continue
229
- is_continue = False
230
- if is_except:
231
- for item in is_except:
232
- if item in os.path.join(root, name):
233
- # print(name)
234
- is_continue = True
235
- break
236
- if is_continue: # 需要排除不做处理的文件或文件夹
237
- continue
238
-
239
- # 这里排除掉非目标报表
240
- is_continue = False
241
- db_name = None # 初始化参数
242
- collection_name = None
243
- for item in report_names:
244
- if item['文件简称'] in name:
245
- db_name = item['数据库名']
246
- collection_name = item['集合名称']
247
- is_continue = True
248
- if not is_continue:
249
- continue
250
- if name.endswith('.csv') and '人群属性_万里马官方旗舰店' in name: # 推广类报表
251
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
252
- elif name.endswith('.csv') and 'dmp人群报表_' in name:
253
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
254
- elif name.endswith('.csv') and '货品洞察_全店单品' in name:
255
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
256
-
257
- # 将数据传入 self.datas 等待更新进数据库
258
- if not db_name or not collection_name:
259
- # print(f'db_name/collection_name 不能为空')
260
- continue
261
- self.datas.append(
262
- {
263
- '数据库名': db_name,
264
- '集合名称': collection_name,
265
- '数据主体': df,
266
- '文件名': name,
267
- }
268
- )
269
-
270
- def tg_reports(self, path=None, is_except=[]):
271
- """ 处理天猫淘宝推广类报表 """
272
- if not path:
273
- path = self.path
274
- report_names = [
275
- {
276
- '文件简称': 'tg_report_主体报表',
277
- '数据库名': '推广数据2',
278
- '集合名称': '主体报表',
279
- },
280
- {
281
- '文件简称': 'tg_report_创意报表_创意',
282
- '数据库名': '推广数据2',
283
- '集合名称': '创意报表_创意',
284
- },
285
- {
286
- '文件简称': 'tg_report_创意报表_素材',
287
- '数据库名': '推广数据2',
288
- '集合名称': '创意报表_素材',
289
- },
290
- {
291
- '文件简称': 'tg_report_单元报表',
292
- '数据库名': '推广数据2',
293
- '集合名称': '单元报表',
294
- },
295
- {
296
- '文件简称': 'tg_report_地域报表_省份',
297
- '数据库名': '推广数据2',
298
- '集合名称': '地域报表_省份',
299
- },
300
- {
301
- '文件简称': 'tg_report_地域报表_城市',
302
- '数据库名': '推广数据2',
303
- '集合名称': '地域报表_城市',
304
- },
305
- {
306
- '文件简称': 'tg_report_关键词报表',
307
- '数据库名': '推广数据2',
308
- '集合名称': '关键词报表',
309
- },
310
- {
311
- '文件简称': 'tg_report_计划报表',
312
- '数据库名': '推广数据2',
313
- '集合名称': '计划报表',
314
- },
315
- {
316
- '文件简称': 'tg_report_权益报表',
317
- '数据库名': '推广数据2',
318
- '集合名称': '权益报表',
319
- },
320
- {
321
- '文件简称': 'tg_report_人群报表',
322
- '数据库名': '推广数据2',
323
- '集合名称': '人群报表',
324
- },
325
- {
326
- '文件简称': 'tg_report_营销场景报表',
327
- '数据库名': '推广数据2',
328
- '集合名称': '营销场景报表',
329
- },
330
- {
331
- '文件简称': 'tg_report_超级直播报表_人群',
332
- '数据库名': '推广数据2',
333
- '集合名称': '超级直播',
334
- },
335
- {
336
- '文件简称': 'tg_report_品销宝_明星店铺',
337
- '数据库名': '推广数据2',
338
- '集合名称': '品销宝',
339
- },
340
- {
341
- '文件简称': 'tg_report_超级短视频_主体',
342
- '数据库名': '推广数据2',
343
- '集合名称': '超级短视频_主体',
344
- }
345
- ]
346
- for root, dirs, files in os.walk(path, topdown=False):
347
- for name in files:
348
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
349
- continue
350
- # if 'py_xg' in name:
351
- # continue
352
- is_continue = False
353
- if is_except:
354
- for item in is_except:
355
- if item in os.path.join(root, name):
356
- # print(name)
357
- is_continue = True
358
- break
359
- if is_continue: # 需要排除不做处理的文件或文件夹
360
- continue
361
-
362
- # 这里排除掉非推广类报表
363
- is_continue = False
364
- db_name = None # 初始化参数
365
- collection_name = None
366
- for item in report_names:
367
- if item['文件简称'] in name:
368
- db_name = item['数据库名']
369
- collection_name = item['集合名称']
370
- is_continue = True
371
- if not is_continue:
372
- continue
373
- # 区分淘宝和天猫的报表
374
- if '万里马官方旗舰店' in name:
375
- pass
376
- elif '万里马官方企业店' in name:
377
- db_name = '推广数据_淘宝店'
378
- else:
379
- print(f'报表名称错误,不属于天猫/淘宝店:{name}')
380
- continue
381
-
382
- if name.endswith('.csv') and '明星店铺' not in name: # 推广类报表
383
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
384
- elif name.endswith('.csv') and '品销宝_明星店铺' in name:
385
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
386
- elif name.endswith('.xlsx') and '品销宝_明星店铺' in name:
387
- # 品销宝
388
- sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群'] # 品销宝
389
- file_name4 = os.path.splitext(name)[0] # 明星店铺报表
390
- new_df = []
391
- for sheet4 in sheets4:
392
- df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
393
- if len(df) == 0:
394
- print(f'{name} 报表数据为空')
395
- os.remove(os.path.join(root, name))
396
- continue
397
- if len(df) < 1:
398
- print(f'{name} 跳过')
399
- continue
400
- else:
401
- shop_name = re.findall(r'明星店铺_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
402
- df.insert(loc=1, column='店铺名称', value=shop_name)
403
- df.insert(loc=2, column='报表类型', value=sheet4)
404
- # if '访客触达率' not in df.columns.tolist():
405
- # df['访客触达率'] = '0'
406
- df.fillna(0, inplace=True)
407
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # 转换日期列
408
- # min_clm = str(df['日期'].min()).split(' ')[0]
409
- # max_clm = str(df['日期'].max()).split(' ')[0]
410
- new_file_name4 = f'{sheet4}_py_xg_{file_name4}.csv'
411
- # 以sheet名进一步创建子文件夹
412
- # root_new = os.path.join(self.source_path, '推广报表/品销宝', sheet4)
413
- self.save_to_csv(df, upload_path, new_file_name4)
414
- new_df.append(df)
415
- df = pd.concat(new_df) # 品销宝 1 表有 6 个 sheet
416
- os.remove(os.path.join(root, name))
417
-
418
- # 将数据传入 self.datas 等待更新进数据库
419
- if not db_name or not collection_name:
420
- print(f'db_name/collection_name 不能为空')
421
- continue
422
- # print(db_name, collection_name)
423
- self.datas.append(
424
- {
425
- '数据库名': db_name,
426
- '集合名称': collection_name,
427
- '数据主体': df,
428
- '文件名': name,
429
- }
430
- )
431
-
432
- def syj_reports_tm(self, path=None, is_except=[]):
433
- """ 生意经报表 """
434
- if not path:
435
- path = self.path
436
- report_names = [
437
- {
438
- '文件简称': 'baobei',
439
- '数据库名': '生意经3',
440
- '集合名称': '宝贝指标',
441
- },
442
- {
443
- '文件简称': 'order',
444
- '数据库名': '生意经3',
445
- '集合名称': '订单数据',
446
- },
447
- {
448
- '文件简称': '省份城市分析',
449
- '数据库名': '生意经3',
450
- '集合名称': '省份城市分析',
451
- },
452
- {
453
- '文件简称': '店铺销售指标',
454
- '数据库名': '生意经3',
455
- '集合名称': '店铺销售指标',
456
- },
457
- ]
458
-
459
- for root, dirs, files in os.walk(path, topdown=False):
460
- for name in files:
461
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
462
- continue
463
- # if 'py_xg' in name:
464
- # continue
465
- is_continue = False
466
- if is_except:
467
- for item in is_except:
468
- if item in os.path.join(root, name):
469
- # print(name)
470
- is_continue = True
471
- break
472
- if is_continue: # 需要排除不做处理的文件或文件夹
473
- continue
474
-
475
- # 这里排除掉非目标报表
476
- is_continue = False
477
- db_name = None # 初始化参数
478
- collection_name = None
479
- for item in report_names:
480
- if item['文件简称'] in name:
481
- db_name = item['数据库名']
482
- collection_name = item['集合名称']
483
- is_continue = True
484
- if not is_continue:
485
- continue
486
-
487
- if name.endswith('.csv') and 'baobei' in name:
488
- # encoding = self.get_encoding(file_path=os.path.join(root, name))
489
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
490
- elif name.endswith('.csv') and 'order' in name:
491
- """ 如果是手动下载的表格,这里不能使用表格原先的 gb2312, 会报错 """
492
- # df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
493
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
494
- elif name.endswith('.csv') and '省份城市分析' in name:
495
- encoding = self.get_encoding(file_path=os.path.join(root, name))
496
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
497
- pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\W', name)[0] # 注意后面可能有小括号 ...27 (2).csv
498
- date = '-'.join(pattern[1:])
499
- new_name = f'py_xg_天猫_{pattern[0]}-{date}.csv'
500
- df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
501
- if len(df) == 0:
502
- print(f'{name} 报表数据为空')
503
- os.remove(os.path.join(root, name))
504
- continue
505
- df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
506
- df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
507
- df['省'].fillna(method='ffill', inplace=True)
508
- df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
509
- pov = df.pop('省')
510
- city = df.pop('城市')
511
- df['省+市'] = df['省份']
512
- df['省份'] = pov
513
- df.insert(loc=1, column='城市', value=city)
514
- df.insert(loc=0, column='日期', value=date)
515
- df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
516
- self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
517
- os.remove(os.path.join(root, name))
518
- elif name.endswith('.csv') and '店铺销售指标' in name:
519
- # 生意经, 店铺指标,仅限月数据,实际日指标也可以
520
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
521
-
522
- # 将数据传入 self.datas 等待更新进数据库
523
- if not db_name or not collection_name:
524
- # print(f'db_name/collection_name 不能为空')
525
- continue
526
- self.datas.append(
527
- {
528
- '数据库名': db_name,
529
- '集合名称': collection_name,
530
- '数据主体': df,
531
- '文件名': name,
532
- }
533
- )
534
-
535
- def jd_reports(self, path=None, is_except=[]):
536
- """ 处理京东报表 """
537
- if not path:
538
- path = self.path
539
- report_names = [
540
- {
541
- '文件简称': '京东推广_点击成交',
542
- '数据库名': '京东数据3',
543
- '集合名称': '推广数据_京准通',
544
- },
545
- {
546
- '文件简称': '京东推广_搜索词',
547
- '数据库名': '京东数据3',
548
- '集合名称': '推广数据_搜索词报表',
549
- },
550
- {
551
- '文件简称': '京东推广_关键词',
552
- '数据库名': '京东数据3',
553
- '集合名称': '推广数据_关键词报表',
554
- },
555
- {
556
- '文件简称': 'sku_商品明细',
557
- '数据库名': '京东数据3',
558
- '集合名称': '京东商智_sku_商品明细',
559
- },
560
- {
561
- '文件简称': 'spu_商品明细',
562
- '数据库名': '京东数据3',
563
- '集合名称': '京东商智_spu_商品明细',
564
- },
565
- {
566
- '文件简称': '店铺来源_三级来源',
567
- '数据库名': '京东数据3',
568
- '集合名称': '京东商智_店铺来源',
569
- },
570
- ]
571
-
572
- for root, dirs, files in os.walk(path, topdown=False):
573
- for name in files:
574
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
575
- continue
576
- # if 'py_xg' in name:
577
- # continue
578
- is_continue = False
579
- if is_except:
580
- for item in is_except:
581
- if item in os.path.join(root, name):
582
- # print(name)
583
- is_continue = True
584
- break
585
- if is_continue: # 需要排除不做处理的文件或文件夹
586
- continue
587
-
588
- # 这里排除掉非目标报表
589
- is_continue = False
590
- db_name = None # 初始化参数
591
- collection_name = None
592
- for item in report_names:
593
- if item['文件简称'] in name:
594
- db_name = item['数据库名']
595
- collection_name = item['集合名称']
596
- is_continue = True
597
- if not is_continue:
598
- continue
599
-
600
- if name.endswith('.csv') and '京东推广_' in name:
601
- # df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
602
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
603
- # new_name = f'py_xg_{name}'
604
- # if os.path.isfile(os.path.join(root, new_name)):
605
- # os.remove(os.path.join(root, new_name))
606
- # os.rename(os.path.join(root, name), os.path.join(root, new_name))
607
- elif name.endswith('.csv') and 'sku_商品明细' in name:
608
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
609
- # df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
610
- # df.replace(to_replace=['-'], value='', regex=False, inplace=True)
611
- # pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
612
- # df.insert(loc=0, column='日期', value=pattern)
613
- # df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
614
- # df.fillna(0, inplace=True)
615
- # new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
616
- # df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)
617
- # # df.to_excel(os.path.join(upload_path, new_name),
618
- # # index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
619
- # os.remove(os.path.join(root, name))
620
- elif name.endswith('.csv') and 'spu_商品明细' in name:
621
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
622
- # df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
623
- # df.replace(to_replace=['-'], value='', regex=False, inplace=True)
624
- # pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
625
- # df.insert(loc=0, column='日期', value=pattern)
626
- # df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
627
- # df.fillna(0, inplace=True)
628
- # new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
629
- # df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)
630
- # # df.to_excel(os.path.join(upload_path, new_name),
631
- # # index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
632
- # os.remove(os.path.join(root, name))
633
- elif name.endswith('.csv') and '店铺来源_三级来源' in name:
634
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
635
- # df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
636
- # df.replace(to_replace=['-'], value='', regex=False, inplace=True)
637
- # df.rename(columns={'时间': '日期'}, inplace=True)
638
- # for col in df.columns.tolist():
639
- # if '环比' in col or '同比' in col:
640
- # df.drop(col, axis=1, inplace=True)
641
- # df.fillna(0, inplace=True)
642
- # new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
643
- # df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False, header=True)
644
- # # df.to_excel(os.path.join(upload_path, new_name),
645
- # # index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
646
- # os.remove(os.path.join(root, name))
647
-
648
- # 将数据传入 self.datas 等待更新进数据库
649
- if not db_name or not collection_name:
650
- # print(f'db_name/collection_name 不能为空')
651
- continue
652
- # print(name)
653
- self.datas.append(
654
- {
655
- '数据库名': db_name,
656
- '集合名称': collection_name,
657
- '数据主体': df,
658
- '文件名': name,
659
- }
660
- )
661
-
662
- def sp_scene_clean(self, path=None, is_except=[]):
663
- if not path:
664
- path = self.path
665
- report_names = [
666
- {
667
- '文件简称': '商品素材_', # 文件名中包含的字符
668
- '数据库名': '属性设置3',
669
- '集合名称': '商品素材中心',
670
- },
671
- {
672
- '文件简称': '商品类目属性_', # 文件名中包含的字符
673
- '数据库名': '属性设置3',
674
- '集合名称': '商品类目属性',
675
- },
676
- {
677
- '文件简称': '商品主图视频_', # 文件名中包含的字符
678
- '数据库名': '属性设置3',
679
- '集合名称': '商品主图视频',
680
- },
681
- {
682
- '文件简称': '商品sku属性_', # 文件名中包含的字符
683
- '数据库名': '属性设置3',
684
- '集合名称': '商品sku',
685
- },
686
- ]
687
-
688
- for root, dirs, files in os.walk(path, topdown=False):
689
- for name in files:
690
- if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
691
- continue
692
- if 'py_xg' in name:
693
- continue
694
- is_continue = False
695
- if is_except:
696
- for item in is_except:
697
- if item in os.path.join(root, name):
698
- # print(name)
699
- is_continue = True
700
- break
701
- if is_continue: # 需要排除不做处理的文件或文件夹
702
- continue
703
- db_name = None # 初始化参数
704
- collection_name = None
705
- for item in report_names:
706
- if item['文件简称'] in name:
707
- db_name = item['数据库名']
708
- collection_name = item['集合名称']
709
- is_continue = True
710
- if not is_continue:
711
- continue
712
-
713
- if name.endswith('.xlsx') and '商品素材_' in name:
714
- shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)_', name)[0]
715
- df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
716
- if '日期' not in df.columns.tolist():
717
- df.insert(loc=0, column='日期', value=datetime.datetime.today().strftime('%Y-%m-%d'))
718
- if '店铺名称' not in df.columns.tolist():
719
- df.insert(loc=1, column='店铺名称', value=shop_name)
720
- new_name = f'py_xg_{name}'
721
- df.to_excel(os.path.join(upload_path, new_name),
722
- index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
723
- os.remove(os.path.join(root, name))
724
- elif name.endswith('.csv') and ('商品类目属性' in name or '商品主图视频' in name or '商品sku属性' in name):
725
- df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
726
- new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
727
- if os.path.isfile(os.path.join(root, new_name)):
728
- os.remove(os.path.join(root, new_name))
729
- os.rename(os.path.join(root, name), os.path.join(root, new_name))
730
-
731
- # 将数据传入 self.datas 等待更新进数据库
732
- if not db_name or not collection_name:
733
- # print(f'db_name/collection_name 不能为空')
734
- continue
735
- self.datas.append(
736
- {
737
- '数据库名': db_name,
738
- '集合名称': collection_name,
739
- '数据主体': df,
740
- '文件名': name,
741
- }
742
- )
743
- """
744
- {文件分类}
745
- 将已处理完的文件 分类移到原始文件夹下
746
- 此处t_path参数定义了子文件夹的生成名称
747
- """
748
-
749
- @staticmethod
750
- def move_files(path, _name, target_path, _as_month=None):
751
- """
752
- name: 移动的文件名,
753
- target_path: 目标位置
754
- """
755
- t2 = target_path # t2 赋值有用, 不能省略
756
- if not os.path.exists(t2): # 如果目录不存在则创建
757
- os.makedirs(t2, exist_ok=True)
758
- if _as_month:
759
- _date = re.findall(r'(\d{4}-\d{2})-\d{2}', str(_name))
760
- if _date:
761
- _date = _date[0]
762
- t2 = os.path.join(t2, _date) # 添加 年月分类
763
- if not os.path.exists(t2):
764
- os.makedirs(t2, exist_ok=True)
765
- old_file = os.path.join(t2, _name) # 检查目标位置是否已经存在该文件
766
- if os.path.isfile(old_file):
767
- os.remove(old_file) # 如果存在则移除
768
- shutil.move(os.path.join(path, _name), t2) # 将文件从下载文件夹移到目标位置
769
-
770
- def move_sycm(self, path=None, is_except=[]):
771
- """ 生意参谋 """
772
- if not path:
773
- path = self.path
774
- for root, dirs, files in os.walk(path, topdown=False):
775
- for name in files:
776
- # print(name)
777
- is_continue = False
778
- if is_except:
779
- for item in is_except:
780
- # print(item, f'-----', os.path.join(root, name))
781
- if item in os.path.join(root, name):
782
- # print(name)
783
- is_continue = True
784
- break
785
- if is_continue: # 需要排除不做处理的文件或文件夹
786
- continue
787
-
788
- # print(is_except, is_continue)
789
- def bib(paths, _as_month=None):
790
- """闭包函数"""
791
- self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
792
-
793
- if 'py_xg' not in name: # 排除非目标文件
794
- continue
795
-
796
- if name.endswith('.csv') and '商品排行_' in name:
797
- t_path = os.path.join(self.source_path, '生意参谋', '商品排行')
798
- bib(t_path, _as_month=True)
799
- elif name.endswith('.csv') and '店铺来源_来源构成_' in name:
800
- t_path = os.path.join(self.source_path, '生意参谋', '店铺流量来源')
801
- bib(t_path, _as_month=True)
802
- elif name.endswith('.csv') and (
803
- '商品类目属性' in name or '商品主图视频' in name or '商品sku属性' in name):
804
- t_path = os.path.join(self.source_path, '生意参谋', '商品属性')
805
- bib(t_path, _as_month=True)
806
- elif name.endswith('.csv') and '爱库存_商品榜单_' in name:
807
- t_path = os.path.join(self.source_path, '爱库存', '商品spu榜单')
808
- bib(t_path, _as_month=True)
809
- elif name.endswith('.csv') and '手淘搜索_本店引流词_' in name:
810
- t_path = os.path.join(self.source_path, '生意参谋', '手淘搜索_本店引流词')
811
- bib(t_path, _as_month=True)
812
- elif name.endswith('.csv') and '直播分场次效果_' in name:
813
- t_path = os.path.join(self.source_path, '生意参谋', '直播分场次效果')
814
- bib(t_path, _as_month=True)
815
-
816
- def move_dmp(self, path=None, is_except=[]):
817
- """ 达摩盘 """
818
- if not path:
819
- path = self.path
820
- for root, dirs, files in os.walk(path, topdown=False):
821
- for name in files:
822
- # print(name)
823
- is_continue = False
824
- if is_except:
825
- for item in is_except:
826
- # print(item, f'-----', os.path.join(root, name))
827
- if item in os.path.join(root, name):
828
- # print(name)
829
- is_continue = True
830
- break
831
- if is_continue: # 需要排除不做处理的文件或文件夹
832
- continue
833
-
834
- # print(is_except, is_continue)
835
- def bib(paths, _as_month=None):
836
- """闭包函数"""
837
- self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
838
-
839
- if 'py_xg' not in name: # 排除非目标文件
840
- continue
841
-
842
- if name.endswith('.csv') and '人群属性_万里马官方旗舰店' in name:
843
- t_path = os.path.join(self.source_path, '达摩盘', '我的人群属性')
844
- bib(t_path, _as_month=True)
845
- elif name.endswith('.csv') and 'dmp人群报表_' in name:
846
- t_path = os.path.join(self.source_path, '达摩盘', 'dmp人群报表')
847
- bib(t_path, _as_month=True)
848
- elif name.endswith('.csv') and '货品洞察_全店单品' in name:
849
- t_path = os.path.join(self.source_path, '达摩盘', '货品洞察')
850
- bib(t_path, _as_month=True)
851
-
852
- # @try_except
853
- def move_sjy(self, path=None, is_except=[]):
854
- if not path:
855
- path = self.path
856
- for root, dirs, files in os.walk(path, topdown=False):
857
- for name in files:
858
- # print(name)
859
- is_continue = False
860
- if is_except:
861
- for item in is_except:
862
- # print(item, f'-----', os.path.join(root, name))
863
- if item in os.path.join(root, name):
864
- # print(name)
865
- is_continue = True
866
- break
867
- if is_continue: # 需要排除不做处理的文件或文件夹
868
- continue
869
-
870
- # print(is_except, is_continue)
871
- def bib(paths, _as_month=None):
872
- """闭包函数"""
873
- self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
874
-
875
- if 'py_xg' not in name: # 排除非目标文件
876
- continue
877
-
878
- if name.endswith('.csv') and 'baobei' in name:
879
- t_path = os.path.join(self.source_path, '生意经', '宝贝指标')
880
- bib(t_path, _as_month=True)
881
- elif name.endswith('.csv') and '省份城市分析' in name:
882
- t_path = os.path.join(self.source_path, '生意经', '省份城市分析')
883
- bib(t_path, _as_month=True)
884
- elif name.endswith('.csv') and '店铺销售指标' in name:
885
- t_path = os.path.join(self.source_path, '生意经', '店铺销售指标')
886
- bib(t_path, _as_month=False)
887
- elif name.endswith('.csv') and 'order' in name:
888
- t_path = os.path.join(self.source_path, '生意经', '订单数据')
889
- bib(t_path, _as_month=False)
890
-
891
- # @try_except
892
- def move_jd(self, path=None, is_except=[]):
893
- if not path:
894
- path = self.path
895
- for root, dirs, files in os.walk(path, topdown=False):
896
- for name in files:
897
- # print(name)
898
- is_continue = False
899
- if is_except:
900
- for item in is_except:
901
- # print(item, f'-----', os.path.join(root, name))
902
- if item in os.path.join(root, name):
903
- # print(name)
904
- is_continue = True
905
- break
906
- if is_continue: # 需要排除不做处理的文件或文件夹
907
- continue
908
-
909
- # print(is_except, is_continue)
910
- def bib(paths, _as_month=None):
911
- """闭包函数"""
912
- self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
913
-
914
- if 'py_xg' not in name: # 排除非目标文件
915
- continue
916
-
917
- if name.endswith('.csv') and 'spu_商品明细' in name:
918
- t_path = os.path.join(self.source_path, '京东报表', '京东商智_spu_商品明细')
919
- bib(t_path, _as_month=True)
920
- elif name.endswith('.csv') and 'sku_商品明细' in name:
921
- t_path = os.path.join(self.source_path, '京东报表', '京东商智_sku_商品明细')
922
- bib(t_path, _as_month=True)
923
- elif name.endswith('.csv') and '京东推广_搜索词' in name:
924
- t_path = os.path.join(self.source_path, '京东报表', '搜索词报表')
925
- bib(t_path, _as_month=True)
926
- elif name.endswith('.csv') and '京东推广_点击成交' in name:
927
- t_path = os.path.join(self.source_path, '京东报表', '推广报表')
928
- bib(t_path, _as_month=True)
929
- elif name.endswith('.csv') and '京东推广_关键词点击' in name:
930
- t_path = os.path.join(self.source_path, '京东报表', '关键词报表')
931
- bib(t_path, _as_month=True)
932
- elif name.endswith('.csv') and '店铺来源_三级来源' in name:
933
- t_path = os.path.join(self.source_path, '京东报表', '店铺来源_三级来源')
934
- bib(t_path, _as_month=True)
935
-
936
- # @try_except
937
- def move_tg_tm(self, path=None, is_except=[]):
938
- if not path:
939
- path = self.path
940
- for root, dirs, files in os.walk(path, topdown=False):
941
- for name in files:
942
- # print(name)
943
- is_continue = False
944
- if is_except:
945
- for item in is_except:
946
- # print(item, f'-----', os.path.join(root, name))
947
- if item in os.path.join(root, name):
948
- # print(name)
949
- is_continue = True
950
- break
951
- if is_continue: # 需要排除不做处理的文件或文件夹
952
- continue
953
- # print(is_except, is_continue)
954
- def bib(paths, _as_month=None):
955
- """闭包函数"""
956
- self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
957
-
958
- if 'py_xg' not in name: # 排除非目标文件
959
- continue
960
-
961
- if name.endswith('.csv') and 'tg_report_主体报表_万里马官方旗舰店' in name:
962
- t_path = os.path.join(self.source_path, '天猫推广报表', '主体报表')
963
- bib(t_path, _as_month=True)
964
- elif name.endswith('.csv') and 'tg_report_营销场景报表_万里马官方旗舰店' in name:
965
- t_path = os.path.join(self.source_path, '天猫推广报表', '营销场景报表')
966
- bib(t_path, _as_month=True)
967
- elif name.endswith('.csv') and 'tg_report_人群报表_万里马官方旗舰店' in name:
968
- t_path = os.path.join(self.source_path, '天猫推广报表', '人群报表')
969
- bib(t_path, _as_month=True)
970
- elif name.endswith('.csv') and 'tg_report_权益报表_万里马官方旗舰店' in name:
971
- t_path = os.path.join(self.source_path, '天猫推广报表', '权益报表')
972
- bib(t_path, _as_month=True)
973
- elif name.endswith('.csv') and 'tg_report_计划报表_万里马官方旗舰店' in name:
974
- t_path = os.path.join(self.source_path, '天猫推广报表', '计划报表')
975
- bib(t_path, _as_month=True)
976
- elif name.endswith('.csv') and 'tg_report_关键词报表_万里马官方旗舰店' in name:
977
- t_path = os.path.join(self.source_path, '天猫推广报表', '关键词报表')
978
- bib(t_path, _as_month=True)
979
- elif name.endswith('.csv') and 'tg_report_地域报表_省份_万里马官方旗舰店' in name:
980
- t_path = os.path.join(self.source_path, '天猫推广报表', '地域报表_省份')
981
- bib(t_path, _as_month=True)
982
- elif name.endswith('.csv') and 'tg_report_地域报表_城市_万里马官方旗舰店' in name:
983
- t_path = os.path.join(self.source_path, '天猫推广报表', '地域报表_城市')
984
- bib(t_path, _as_month=True)
985
- elif name.endswith('.csv') and 'tg_report_单元报表_万里马官方旗舰店' in name:
986
- t_path = os.path.join(self.source_path, '天猫推广报表', '单元报表')
987
- bib(t_path, _as_month=True)
988
- elif name.endswith('.csv') and 'tg_report_创意报表_素材粒度_万里马官方旗舰店' in name:
989
- t_path = os.path.join(self.source_path, '天猫推广报表', '创意报表_素材粒度')
990
- bib(t_path, _as_month=True)
991
- elif name.endswith('.csv') and 'tg_report_创意报表_创意粒度_万里马官方旗舰店' in name:
992
- t_path = os.path.join(self.source_path, '天猫推广报表', '创意报表_创意粒度')
993
- bib(t_path, _as_month=True)
994
- elif name.endswith('.csv') and 'tg_report_超级直播报表_人群_万里马官方旗舰店' in name:
995
- t_path = os.path.join(self.source_path, '天猫推广报表', '超级直播报表_人群')
996
- bib(t_path, _as_month=True)
997
- elif name.endswith('.csv') and '超级短视频_主体' in name:
998
- t_path = os.path.join(self.source_path, '天猫推广报表', '超级短视频_主体')
999
- bib(t_path, _as_month=True)
1000
-
1001
- elif name.endswith('.csv') and 'tg_report_品销宝_明星店铺_' in name:
1002
- t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝')
1003
- bib(t_path, _as_month=True)
1004
-
1005
- elif name.endswith('xlsx') and '商品素材_万里马官方旗舰店' in name:
1006
- t_path = os.path.join(self.source_path, '商品素材')
1007
- bib(t_path, _as_month=True)
1008
- elif name.endswith('xlsx') and '商品素材_万里马官方企业店' in name:
1009
- t_path = os.path.join(self.source_path, '商品素材')
1010
- bib(t_path, _as_month=True)
1011
-
1012
- # @try_except
1013
- def move_tg_tb(self, path=None, is_except=[]):
1014
- if not path:
1015
- path = self.path
1016
- for root, dirs, files in os.walk(path, topdown=False):
1017
- for name in files:
1018
- # print(name)
1019
- is_continue = False
1020
- if is_except:
1021
- for item in is_except:
1022
- # print(item, f'-----', os.path.join(root, name))
1023
- if item in os.path.join(root, name):
1024
- # print(name)
1025
- is_continue = True
1026
- break
1027
- if is_continue: # 需要排除不做处理的文件或文件夹
1028
- continue
1029
-
1030
- # print(is_except, is_continue)
1031
- def bib(paths, _as_month=None):
1032
- """闭包函数"""
1033
- self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
1034
-
1035
- if 'py_xg' not in name: # 排除非目标文件
1036
- continue
1037
-
1038
- if name.endswith('.csv') and 'tg_report_主体报表_万里马官方企业店' in name:
1039
- t_path = os.path.join(self.source_path, '淘宝推广报表', '主体报表')
1040
- bib(t_path, _as_month=True)
1041
- elif name.endswith('.csv') and 'tg_report_营销场景报表_万里马官方企业店' in name:
1042
- t_path = os.path.join(self.source_path, '淘宝推广报表', '营销场景报表')
1043
- bib(t_path, _as_month=True)
1044
- elif name.endswith('.csv') and 'tg_report_人群报表_万里马官方企业店' in name:
1045
- t_path = os.path.join(self.source_path, '淘宝推广报表', '人群报表')
1046
- bib(t_path, _as_month=True)
1047
- elif name.endswith('.csv') and 'tg_report_权益报表_万里马官方企业店' in name:
1048
- t_path = os.path.join(self.source_path, '淘宝推广报表', '权益报表')
1049
- bib(t_path, _as_month=True)
1050
- elif name.endswith('.csv') and 'tg_report_计划报表_万里马官方企业店' in name:
1051
- t_path = os.path.join(self.source_path, '淘宝推广报表', '计划报表')
1052
- bib(t_path, _as_month=True)
1053
- elif name.endswith('.csv') and 'tg_report_关键词报表_万里马官方企业店' in name:
1054
- t_path = os.path.join(self.source_path, '淘宝推广报表', '关键词报表')
1055
- bib(t_path, _as_month=True)
1056
- elif name.endswith('.csv') and 'tg_report_地域报表_省份_万里马官方企业店' in name:
1057
- t_path = os.path.join(self.source_path, '淘宝推广报表', '地域报表_省份')
1058
- bib(t_path, _as_month=True)
1059
- elif name.endswith('.csv') and 'tg_report_地域报表_城市_万里马官方企业店' in name:
1060
- t_path = os.path.join(self.source_path, '淘宝推广报表', '地域报表_城市')
1061
- bib(t_path, _as_month=True)
1062
- elif name.endswith('.csv') and 'tg_report_单元报表_万里马官方企业店' in name:
1063
- t_path = os.path.join(self.source_path, '淘宝推广报表', '单元报表')
1064
- bib(t_path, _as_month=True)
1065
- elif name.endswith('.csv') and 'tg_report_创意报表_素材粒度_万里马官方企业店' in name:
1066
- t_path = os.path.join(self.source_path, '淘宝推广报表', '创意报表_素材粒度')
1067
- bib(t_path, _as_month=True)
1068
- elif name.endswith('.csv') and 'tg_report_创意报表_创意粒度_万里马官方企业店' in name:
1069
- t_path = os.path.join(self.source_path, '淘宝推广报表', '创意报表_创意粒度')
1070
- bib(t_path, _as_month=True)
1071
- elif name.endswith('.csv') and 'tg_report_超级直播报表_万里马官方企业店' in name:
1072
- t_path = os.path.join(self.source_path, '淘宝推广报表', '超级直播报表')
1073
- bib(t_path, _as_month=True)
1074
-
1075
- # @try_except
1076
    # @try_except
    def new_unzip(self, path=None, is_move=None):
        """
        Unzip archives under ``path`` and optionally remove the zip files.

        Special handling for JD "全部渠道_商品明细" exports:
        1. read the file name inside the zip;
        2. build the extraction path and check for an existing file with the
           same name;
        3. if one exists, rename it first (date extracted from the zip's own
           file name and re-joined into a new name);
        4. then extract the zip;
        5. the freshly extracted file still needs a later rename pass
           (originally ``_jd_rename``).
        This JD branch is marked as expired in the original and can be removed.

        :param path: directory to scan; defaults to ``self.path``.
        :param is_move: truthy — delete every processed zip in the download dir.
        """
        if not path:
            path = self.path
        res_names = []  # zip files queued for removal after all handles are closed
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or 'DS_Store' in name:  # skip Office/macOS temp files
                    continue
                if name.endswith('.zip'):
                    old_file = os.path.join(root, name)
                    f = zipfile.ZipFile(old_file, 'r')
                    if len(f.namelist()) == 1:  # archive contains a single file
                        for zip_name in f.namelist():  # read the entry name
                            # zip entry names may be cp437-mojibake; try utf-8 first
                            try:
                                zip_name_1 = zip_name.encode('utf-8').decode('utf-8')
                            except:
                                zip_name_1 = zip_name.encode('cp437').decode('utf-8')
                            new_path = os.path.join(root, zip_name_1)  # extraction target path
                            if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path:  # same-named file already present?
                                # JD-specific branch; marked expired, deletable
                                df = pd.read_excel(new_path, engine='xlrd')
                                try:
                                    # single-date file name: 8 digits then yyyymmdd
                                    pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                                          name)
                                    # date-range file name: yyyymmdd-yyyymmdd
                                    pattern2 = re.findall(
                                        r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                        name)
                                    if pattern1:
                                        year_date = '-'.join(list(pattern1[0])) + '_' + '-'.join(list(pattern1[0]))
                                    elif pattern2:
                                        year_date = '-'.join(list(pattern2[0])[0:3]) + '_' + '-'.join(
                                            list(pattern2[0])[3:7])
                                    else:
                                        year_date = '无法提取日期'
                                        print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                                    # classify as sku/spu by probing for known product IDs
                                    # NOTE(review): hard-coded shop-specific IDs — confirm still valid
                                    if ('10035975359247' in df['商品ID'].values or '10056642622343' in
                                            df['商品ID'].values):
                                        os.rename(new_path,
                                                  os.path.join(root, 'sku_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    elif ('10021440233518' in df['商品ID'].values or '10022867813485' in
                                          df['商品ID'].values):
                                        os.rename(new_path,
                                                  os.path.join(root, 'spu_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    if is_move:
                                        os.remove(os.path.join(root, name))
                                except Exception as e:
                                    # best-effort: log and move on to the next entry
                                    print(e)
                                    continue
                            else:
                                f.extract(zip_name, root)
                                if zip_name_1 != zip_name:  # fix mojibake in the extracted name
                                    os.rename(os.path.join(root, zip_name), os.path.join(root, zip_name_1))
                                if is_move:
                                    res_names.append(name)
                                    # cannot os.remove here: the zip is still open (file-in-use)
                        f.close()
                    else:  # archive contains multiple files
                        f.close()
                        self.unzip_all(path=old_file, save_path=path)

        if is_move:
            # all handles closed now — safe to delete the queued zips
            for name in res_names:
                os.remove(os.path.join(path, name))
                print(f'移除{os.path.join(path, name)}')
1153
-
1154
    @staticmethod
    def unzip_all(path, save_path):
        """
        Extract a multi-file zip archive and repair mojibake in the
        extracted names.

        1. walk the extracted tree and rename files whose names decoded badly;
        2. when a directory name is mojibake, recreate it under the fixed name
           and move the files, then remove the bad directory;
        3. remove macOS's temporary ``__MACOSX`` folders.

        :param path: path of the zip archive to extract.
        :param save_path: directory to extract into (and then clean up).
        """
        with PyZipFile(path) as _f:
            _f.extractall(save_path)
            _f.close()  # redundant inside `with`, kept as-is
        for _root, _dirs, _files in os.walk(save_path, topdown=False):
            for _name in _files:
                if '~$' in _name or 'DS_Store' in _name:  # skip temp files
                    continue
                # zip names written as cp437 decode to the real utf-8 name;
                # if that fails the name was already fine (utf-8 round-trip is a no-op)
                try:
                    _new_root = _root.encode('cp437').decode('utf-8')
                    _new_name = _name.encode('cp437').decode('utf-8')
                except:
                    _new_root = _root.encode('utf-8').decode('utf-8')
                    _new_name = _name.encode('utf-8').decode('utf-8')
                _old = os.path.join(_root, _name)
                _new = os.path.join(_new_root, _new_name)
                if _new_root != _root:  # directory name is mojibake: create fixed dir, move file
                    os.makedirs(_new_root, exist_ok=True)
                    os.rename(_old, _new)
            # per-directory pass: recompute the fixed name for _root itself
            try:
                _new_root = _root.encode('cp437').decode('utf-8')
            except:
                _new_root = _root.encode('utf-8').decode('utf-8')
            # drop the mojibake directory (its contents were moved above) and
            # macOS __MACOSX leftovers; topdown=False makes this safe to delete
            if _new_root != _root or '__MACOSX' in _root:
                shutil.rmtree(_root)
1185
-
1186
- def upload_df(self, path=None):
1187
- """
1188
- 将清洗后的 df 上传数据库, copysh.py 调用
1189
- """
1190
- df_to_json = df_types.DataTypes() # json 文件, 包含数据的 dtypes 信息
1191
-
1192
- # d = mongo.UploadMongo(
1193
- # username=username,
1194
- # password=password,
1195
- # host=host,
1196
- # port=port,
1197
- # drop_duplicates=False,
1198
- # )
1199
- # for data in self.datas:
1200
- # db_name, collection_name, df = data['数据库名'], data['集合名称'], data['数据主体']
1201
- # df_to_json.get_df_types(
1202
- # df=df,
1203
- # db_name=db_name,
1204
- # collection_name=collection_name,
1205
- # is_file_dtype=True, # 默认本地文件优先: True
1206
- # )
1207
- # d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)
1208
- # if d.client:
1209
- # d.client.close()
1210
-
1211
- m = mysql.MysqlUpload(
1212
- username=username,
1213
- password=password,
1214
- host=host,
1215
- port=port,
1216
- )
1217
- for data in self.datas:
1218
- df, db_name, collection_name, rt_filename = data['数据主体'], data['数据库名'], data['集合名称'], data['文件名']
1219
- df_to_json.get_df_types(
1220
- df=df,
1221
- db_name=db_name,
1222
- collection_name=collection_name,
1223
- is_file_dtype=True, # 默认本地文件优先: True
1224
- )
1225
- m.df_to_mysql(
1226
- df=df,
1227
- db_name=db_name,
1228
- table_name=collection_name,
1229
- move_insert=False, # 先删除,再插入,新版有多店数据,不可按日期删除
1230
- df_sql=True, # 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重
1231
- drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
1232
- filename=rt_filename, # 用来追踪处理进度
1233
- service_database=service_database, # 字典
1234
- )
1235
- df_to_json.as_json_file() # 写入 json 文件, 包含数据的 dtypes 信息
1236
-
1237
-
1238
def date_table():
    """
    Build the date-dimension table used by the pbix reports and upload it
    to MySQL (聚合数据.日期表), from 2022-01-01 through yesterday.
    """
    start_date = '2022-01-01'  # first day covered by the table
    yesterday = time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400))
    df = pd.DataFrame(pd.date_range(start=start_date, end=yesterday), columns=['日期'])
    df.sort_values('日期', ascending=True, ignore_index=True, inplace=True)
    df.reset_index(inplace=True)
    row_no = df.pop('index')        # running row number, oldest date first
    month_no = df['日期'].dt.month  # numeric month, reused for 月 and 月索引
    df['日期'] = df['日期'].dt.date  # keep year-month-day only, drop the time part
    df['年'] = df['日期'].apply(lambda d: str(d).split('-')[0] + '年')
    df['月'] = month_no.apply(lambda m: str(m) + '月')
    df['日'] = df['日期'].apply(lambda d: str(d).split('-')[2])
    df['年月'] = df['年'] + df['月']
    df['月日'] = df['月'] + df['日'] + '日'
    df['第n周'] = df['日期'].apply(lambda d: d.strftime('第%W周'))
    df['索引'] = row_no
    df['月索引'] = month_no
    df.sort_values('日期', ascending=False, ignore_index=True, inplace=True)

    m = mysql.MysqlUpload(
        username=username,
        password=password,
        host=host,
        port=port,
    )
    m.df_to_mysql(
        df=df,
        db_name='聚合数据',
        table_name='日期表',
        move_insert=True,  # delete first, then insert
        df_sql=False,  # True would upload the whole table via df.to_sql, no dedup
        drop_duplicates=False,  # True would check for duplicates before insert (slow)
        filename=None,  # used to track processing progress
        service_database=service_database,  # used to track processing progress
    )
1281
-
1282
-
1283
def main(is_mysql=False, is_company=False):
    """
    Full cleaning/upload pipeline entry point.

    :param is_mysql: when False (debug/default), stop after cleaning and
        upload — skip the follow-up aggregation steps below.
    :param is_company: company machines delete everything from the upload
        folder instead of filing files into the archive tree.
    """

    cn = DataClean(
        path=upload_path,  # source dir: the download folder
        source_path=source_path3,  # archive dir for cleaned originals
    )
    cn.new_unzip(is_move=True)  # unzip; is_move: delete the zip after extraction
    cn.sycm_tm(is_except=['except'])  # Tmall sycm (生意参谋)
    cn.dmp_tm(is_except=['except'])  # DMP (达摩盘)
    cn.tg_reports(is_except=['except'])  # promotion reports, Tmall + Taobao together
    cn.syj_reports_tm(is_except=['except'])  # Tmall 生意经

    cn.jd_reports(is_except=['except'])  # clean JD reports
    cn.sp_scene_clean(is_except=['except'])  # product material (商品素材)
    cn.upload_df()  # upload to database

    if is_company:  # company host: remove every file in the upload folder
        files = os.listdir(upload_path)
        for file in files:
            os.remove(os.path.join(upload_path, file))
    else:  # other hosts: file everything into the archive tree
        cn.move_sycm(is_except=['临时文件', ])  # sycm -> archive
        cn.move_dmp(is_except=['临时文件', ])  # DMP -> archive
        cn.move_sjy(is_except=['临时文件',])  # 生意经 -> archive
        cn.move_jd(is_except=['临时文件', ])  # JD -> archive
        cn.move_tg_tm(is_except=['临时文件', ])  # Tmall -> archive
        cn.move_tg_tb(is_except=['临时文件', ])  # Taobao store -> archive

    if not is_mysql:
        return

    # refresh the date dimension table
    date_table()
    # refresh product-year baseline (属性设置 3 - 货品年份基准)
    p = products.Products()
    p.to_mysql()

    conf = myconfig.main()
    # NOTE(review): 'data' appears unused below — confirm before removing
    data = conf['Windows']['xigua_lx']['mysql']['local']
    db_list = conf['Windows']['xigua_lx']['mysql']['数据库集']
    db_list = [item for item in db_list if item != '聚合数据']
    # clean every non-aggregate database
    optimize_data.op_data(
        db_name_lists=db_list,
        days=5,
        is_mongo=True,
        is_mysql=True,
    )

    # data aggregation
    query_data.data_aggregation(months=3)
    time.sleep(60)

    # clean aggregate data; mongodb holds no aggregate data, so mysql only
    optimize_data.op_data(
        db_name_lists=['聚合数据'],
        days=100,
        is_mongo=False,
        is_mysql=True,
    )
1347
-
1348
-
1349
if __name__ == '__main__':
    # run cleaning/upload only; is_mysql=False skips the aggregation steps
    main(is_mysql=False)