mdbq 2.5.4__py3-none-any.whl → 2.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1232 @@
1
+ # -*- coding:utf-8 -*-
2
+ import warnings
3
+ import pandas as pd
4
+ from functools import wraps
5
+ import chardet
6
+ import zipfile
7
+ from pyzipper import PyZipFile
8
+ import os
9
+ import platform
10
+ import pathlib
11
+ import json
12
+ from mdbq.mongo import mongo
13
+ from mdbq.mysql import mysql
14
+ from mdbq.config import get_myconf
15
+ from mdbq.aggregation import df_types
16
+ from mdbq.config import products
17
+ from mdbq.aggregation import optimize_data
18
+ from mdbq.aggregation import query_data
19
+ import datetime
20
+ import time
21
+ import re
22
+ import shutil
23
+ import getpass
24
+
25
+ warnings.filterwarnings('ignore')
26
+
27
+
28
+ if platform.system() == 'Windows':
29
+ # Windows paths
30
+ Data_Path = r'C:\同步空间\BaiduSyncdisk'
31
+ D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
32
+ Share_Path = os.path.join(r'\\192.168.1.198\时尚事业部\01.运营部\天猫报表') # shared drive root
33
+ elif platform.system() == 'Linux':
34
+ Data_Path = '数据中心'
35
+ D_PATH = 'Downloads'
36
+ if not os.path.exists(D_PATH):
37
+ os.makedirs(D_PATH)
38
+ Share_Path = '' # Linux is usually a remote server; no need to access the share
39
+ else:
40
+ Data_Path = f'/Users/{getpass.getuser()}/数据中心' # when using the Mac on its own network
41
+ D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')
42
+ Share_Path = os.path.join('/Volumes/时尚事业部/01.运营部/天猫报表') # shared drive root
43
+
44
+ upload_path = os.path.join(D_PATH, '数据上传中心') # this directory lives under the Downloads folder
45
+ source_path = os.path.join(Data_Path, '原始文件2') # stores downloaded, cleaned files as a database backup
46
+ source_path3 = os.path.join(Data_Path, '原始文件3') # stores downloaded, cleaned files as a database backup
47
+
48
+
49
+ class DataClean:
50
+ """ 数据分类 """
51
+
52
+ def __init__(self, path, source_path, service_databases):
53
+ self.path = path # data source location, the Downloads folder
54
+ self.source_path = source_path # archive directory for the source files
55
+ self.datas = []
56
+ self.service_databases = service_databases
57
+
58
+ @staticmethod
59
+ def try_except(func): # exception-handling decorator defined inside the class
60
+ @wraps(func)
61
+ def wrapper(*args, **kwargs):
62
+ try:
63
+ return func(*args, **kwargs)
64
+ except Exception as e:
65
+ print(f'{func.__name__}, {e}') # report the exception info
66
+
67
+ return wrapper
68
+
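For reference, a minimal standalone sketch of how this decorator pattern behaves; the function risky_parse and the sample call are hypothetical, added only for illustration. Note the wrapper returns None when the wrapped call fails, so callers must tolerate a missing return value:

    from functools import wraps

    def try_except(func):
        # print "function_name, error" instead of raising
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')
        return wrapper

    @try_except
    def risky_parse(value):  # hypothetical demo function
        return int(value)

    risky_parse('abc')  # prints the error and returns None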
69
+ @staticmethod
70
+ def get_encoding(file_path):
71
+ """
72
+ Detect the file's encoding. Reading the whole file is slow; avoid unless necessary.
73
+ """
74
+ with open(file_path, 'rb') as f:
75
+ f1 = f.read()
76
+ encod = chardet.detect(f1).get('encoding')
77
+ return encod
78
+
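Because chardet scans the entire file, detection cost grows with file size, which is what the docstring warns about. A hedged sketch of a common mitigation (an assumption for illustration, not what this package does) is to detect from a leading sample only:

    import chardet

    def guess_encoding(file_path, sample_size=64 * 1024):  # hypothetical helper
        # Detect from the first 64 KB instead of the whole file: much faster,
        # slightly less reliable for encodings that only differ late in the file.
        with open(file_path, 'rb') as f:
            sample = f.read(sample_size)
        return chardet.detect(sample).get('encoding')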
79
+ @staticmethod
80
+ def save_to_csv(_df, _save_paths, filenames, encoding='utf-8_sig'):
81
+ if '.csv' not in filenames:
82
+ filenames = f'{filenames}.csv'
83
+ if not os.path.exists(_save_paths):
84
+ os.makedirs(_save_paths, exist_ok=True)
85
+ _df.to_csv(os.path.join(_save_paths, filenames), encoding=encoding, index=False, header=True)
86
+
87
+ def tg_reports(self, path=None, is_except=[]):
88
+ """ 处理天猫淘宝推广类报表 """
89
+ if not path:
90
+ path = self.path
91
+ report_names = [
92
+ {
93
+ '文件简称': 'tg_report_主体报表',
94
+ '数据库名': '推广数据3',
95
+ '集合名称': '主体报表',
96
+ },
97
+ {
98
+ '文件简称': 'tg_report_创意报表_创意',
99
+ '数据库名': '推广数据3',
100
+ '集合名称': '创意报表_创意',
101
+ },
102
+ {
103
+ '文件简称': 'tg_report_创意报表_素材',
104
+ '数据库名': '推广数据3',
105
+ '集合名称': '创意报表_素材',
106
+ },
107
+ {
108
+ '文件简称': 'tg_report_单元报表',
109
+ '数据库名': '推广数据3',
110
+ '集合名称': '单元报表',
111
+ },
112
+ {
113
+ '文件简称': 'tg_report_地域报表_省份',
114
+ '数据库名': '推广数据3',
115
+ '集合名称': '地域报表_省份',
116
+ },
117
+ {
118
+ '文件简称': 'tg_report_地域报表_城市',
119
+ '数据库名': '推广数据3',
120
+ '集合名称': '地域报表_城市',
121
+ },
122
+ {
123
+ '文件简称': 'tg_report_关键词报表',
124
+ '数据库名': '推广数据3',
125
+ '集合名称': '关键词报表',
126
+ },
127
+ {
128
+ '文件简称': 'tg_report_计划报表',
129
+ '数据库名': '推广数据3',
130
+ '集合名称': '计划报表',
131
+ },
132
+ {
133
+ '文件简称': 'tg_report_权益报表',
134
+ '数据库名': '推广数据3',
135
+ '集合名称': '权益报表',
136
+ },
137
+ {
138
+ '文件简称': 'tg_report_人群报表',
139
+ '数据库名': '推广数据3',
140
+ '集合名称': '人群报表',
141
+ },
142
+ {
143
+ '文件简称': 'tg_report_营销场景报表',
144
+ '数据库名': '推广数据3',
145
+ '集合名称': '营销场景报表',
146
+ },
147
+ {
148
+ '文件简称': 'tg_report_超级直播报表_人群',
149
+ '数据库名': '推广数据3',
150
+ '集合名称': '超级直播',
151
+ },
152
+ {
153
+ '文件简称': 'tg_report_品销宝_明星店铺',
154
+ '数据库名': '推广数据3',
155
+ '集合名称': '品销宝',
156
+ }
157
+ ]
158
+ for root, dirs, files in os.walk(path, topdown=False):
159
+ for name in files:
160
+ if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
161
+ continue
162
+ if 'py_xg' in name:
163
+ continue
164
+ is_continue = False
165
+ if is_except:
166
+ for item in is_except:
167
+ if item in os.path.join(root, name):
168
+ # print(name)
169
+ is_continue = True
170
+ break
171
+ if is_continue: # skip excluded files or folders
172
+ continue
173
+
174
+ # filter out non-promotion reports here
175
+ is_continue = False
176
+ db_name = None
177
+ collection_name = None
178
+ for item in report_names:
179
+ if item['文件简称'] in name:
180
+ db_name = item['数据库名']
181
+ collection_name = item['集合名称']
182
+ is_continue = True
183
+ if not is_continue:
184
+ continue
185
+ # distinguish Taobao and Tmall reports
186
+ if '万里马官方旗舰店' in name:
187
+ db_name = f'天猫_{db_name}'
188
+ elif '万里马官方企业店' in name:
189
+ db_name = f'淘宝_{db_name}'
190
+ else:
191
+ print(f'报表名称错误,不属于天猫/淘宝店:{name}')
192
+ continue
193
+
194
+ if name.endswith('.csv'): # promotion reports
195
+ if '明星店铺' in name: # the 明星店铺 report may be released as a csv file first
196
+ continue
197
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
198
+ shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
199
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
200
+ df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # replace special placeholder characters
201
+ df.fillna(0, inplace=True)
204
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
205
+ df.insert(loc=1, column='店铺名称', value=shop_name)
206
+ new_name = f'py_xg_{name}'
207
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
208
+ os.remove(os.path.join(root, name))
209
+ elif name.endswith('.xlsx') and '品销宝_明星店铺' in name:
210
+ # Pinxiaobao (品销宝) brand report
211
+ sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群'] # Pinxiaobao sheets
212
+ file_name4 = os.path.splitext(name)[0] # star-shop report
213
+ for sheet4 in sheets4:
214
+ df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
215
+ if len(df) == 0:
216
+ print(f'{name} 报表数据为空')
217
+ continue # do not delete the file inside the sheet loop, or reading the remaining sheets would fail
223
+ shop_name = re.findall(r'明星店铺_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
224
+ df.insert(loc=1, column='店铺名称', value=shop_name)
225
+ df.insert(loc=2, column='报表类型', value=sheet4)
226
+ df.fillna(0, inplace=True)
227
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # convert the date column
228
+ # min_clm = str(df['日期'].min()).split(' ')[0]
229
+ # max_clm = str(df['日期'].max()).split(' ')[0]
230
+ new_file_name4 = f'{sheet4}_py_xg_{file_name4}.csv'
231
+ # subfolders could further be created per sheet name:
232
+ # root_new = os.path.join(self.source_path, '推广报表/品销宝', sheet4)
233
+ self.save_to_csv(df, upload_path, new_file_name4)
234
+ os.remove(os.path.join(root, name))
235
+
236
+ # queue the data in self.datas for the database upload
237
+ if not db_name or not collection_name:
238
+ print(f'db_name/collection_name 不能为空')
239
+ continue
240
+ self.datas.append(
241
+ {
242
+ '数据库名': db_name,
243
+ '集合名称': collection_name,
244
+ '数据主体': df,
245
+ '文件名': name,
246
+ }
247
+ )
248
+
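The matching loop above reduces to a substring lookup from file name to (database, collection); since the loop has no break, the last matching table entry wins. A self-contained sketch of that routing step, using a trimmed two-entry copy of the table (the helper route is hypothetical):

    report_names = [
        {'文件简称': 'tg_report_主体报表', '数据库名': '推广数据3', '集合名称': '主体报表'},
        {'文件简称': 'tg_report_关键词报表', '数据库名': '推广数据3', '集合名称': '关键词报表'},
    ]

    def route(name, table=report_names):  # hypothetical helper
        # return (db, collection) of the LAST entry whose 文件简称 occurs
        # in the file name, mirroring the loop above; None when nothing matches
        hit = None
        for item in table:
            if item['文件简称'] in name:
                hit = (item['数据库名'], item['集合名称'])
        return hit

    print(route('tg_report_主体报表_万里马官方旗舰店_2024-01-01.csv'))
    # ('推广数据3', '主体报表')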
249
+ def syj_reports_tm(self, path=None, is_except=[]):
250
+ """ 生意经报表 """
251
+ if not path:
252
+ path = self.path
253
+ report_names = [
254
+ {
255
+ '文件简称': 'baobei',
256
+ '数据库名': '天猫_生意经3',
257
+ '集合名称': '宝贝指标',
258
+ },
259
+ {
260
+ '文件简称': 'order',
261
+ '数据库名': '天猫_生意经3',
262
+ '集合名称': '订单指标',
263
+ },
264
+ {
265
+ '文件简称': '省份城市分析',
266
+ '数据库名': '天猫_生意经3',
267
+ '集合名称': '省份城市分析',
268
+ },
269
+ {
270
+ '文件简称': '店铺销售指标',
271
+ '数据库名': '天猫_生意经3',
272
+ '集合名称': '店铺销售指标',
273
+ },
274
+ ]
275
+
276
+ for root, dirs, files in os.walk(path, topdown=False):
277
+ for name in files:
278
+ if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
279
+ continue
280
+ if 'py_xg' in name:
281
+ continue
282
+ is_continue = False
283
+ if is_except:
284
+ for item in is_except:
285
+ if item in os.path.join(root, name):
286
+ # print(name)
287
+ is_continue = True
288
+ break
289
+ if is_continue: # skip excluded files or folders
290
+ continue
291
+
292
+ # filter out non-target reports here
293
+ is_continue = False
294
+ db_name = None
295
+ collection_name = None
296
+ for item in report_names:
297
+ if item['文件简称'] in name:
298
+ db_name = item['数据库名']
299
+ collection_name = item['集合名称']
300
+ is_continue = True
301
+ if not is_continue:
302
+ continue
303
+
304
+ if name.endswith('.csv') and 'baobei' in name:
305
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
306
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
307
+ pattern = re.findall(r'-(\d{4})(\d{2})(\d{2}).csv', name)[0]
308
+ df['日期'] = '-'.join(pattern)
309
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
310
+ new_name = f"py_xg_天猫_baobeitrains_{'-'.join(pattern)}.csv"
311
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
312
+ os.remove(os.path.join(root, name))
313
+ elif name.endswith('.csv') and 'order' in name:
314
+ """ 这里不能使用表格原先的 gb2312, 会报错 """
315
+ # encoding = self.get_encoding(file_path=os.path.join(root, name))
316
+ df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
317
+ pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)[0]
318
+ date1 = '-'.join(pattern[1:4])
319
+ date2 = '-'.join(pattern[4:7])
320
+ df.insert(loc=0, column='日期', value=date1)
321
+ df.insert(loc=1, column='数据周期', value=f'{date1}_{date2}')
322
+ df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
323
+ df['颜色编码'] = df['商家编码'].apply(
324
+ lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
325
+ new_name = f'py_xg_天猫_order_{date1}_{date2}.csv'
326
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
327
+ os.remove(os.path.join(root, name))
328
+ elif name.endswith('.csv') and '省份城市分析' in name:
329
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
330
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
331
+ pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)[0]
332
+ date = '-'.join(pattern[1:])
333
+ new_name = f'py_xg_天猫_{pattern[0]}-{date}.csv'
335
+ if len(df) == 0:
336
+ print(f'{name} 报表数据为空')
337
+ os.remove(os.path.join(root, name))
338
+ continue
339
+ df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
340
+ df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
341
+ df['省'] = df['省'].ffill() # fillna(method=...) is deprecated in newer pandas
342
+ df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
343
+ pov = df.pop('省')
344
+ city = df.pop('城市')
345
+ df['省+市'] = df['省份']
346
+ df['省份'] = pov
347
+ df.insert(loc=1, column='城市', value=city)
348
+ df.insert(loc=0, column='日期', value=date)
349
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
350
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
351
+ os.remove(os.path.join(root, name))
352
+ elif name.endswith('.csv') and '店铺销售指标' in name:
353
+ # Shengyijing shop metrics; meant for monthly data, though daily data works too
354
+ name_st = re.findall(r'(.*)\(分日', name)
355
+ if not name_st:
356
+ print(f'{name} 已转换的表格')
357
+ continue
358
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
359
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
360
+ if len(df) == 0:
361
+ print(f'{name} 报表数据为空')
362
+ os.remove(os.path.join(root, name))
363
+ continue
364
+ df['日期'] = df['日期'].astype(str).apply(
365
+ lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
366
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # convert the date column
367
+ # min_clm = str(df.min()['日期']).split(' ')[0]
368
+ # max_clm = str(df.max()['日期']).split(' ')[0]
369
+ min_clm = str(df['日期'].min()).split(' ')[0]
370
+ max_clm = str(df['日期'].max()).split(' ')[0]
371
+ new_name = f'py_xg_天猫_{name_st[0]}-{min_clm}_{max_clm}.csv' # drop the "(分日)" suffix when saving
372
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
373
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
374
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
375
+ os.remove(os.path.join(root, name))
376
+
377
+ # queue the data in self.datas for the database upload
378
+ if not db_name or not collection_name:
379
+ print(f'db_name/collection_name 不能为空')
380
+ continue
381
+ self.datas.append(
382
+ {
383
+ '数据库名': db_name,
384
+ '集合名称': collection_name,
385
+ '数据主体': df,
386
+ '文件名': name,
387
+ }
388
+ )
389
+
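The 'order' branch above depends on a single regex to pull both endpoints of the report window out of the file name. A quick check of how the capture groups split (the file name below is hypothetical):

    import re

    name = 'order20240301-20240331.csv'  # hypothetical export name
    groups = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)[0]
    date1 = '-'.join(groups[1:4])  # '2024-03-01'
    date2 = '-'.join(groups[4:7])  # '2024-03-31'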
390
+ def syj_reports_tb(self, path=None, is_except=[]):
391
+ """ 淘宝店 生意经报表 """
392
+ if not path:
393
+ path = self.path
394
+ report_names = [
395
+ {
396
+ '文件简称': 'baobei',
397
+ '数据库名': '淘宝_生意经3',
398
+ '集合名称': '宝贝指标',
399
+ },
400
+ {
401
+ '文件简称': 'order',
402
+ '数据库名': '淘宝_生意经3',
403
+ '集合名称': '订单指标',
404
+ },
405
+ {
406
+ '文件简称': '省份城市分析',
407
+ '数据库名': '淘宝_生意经3',
408
+ '集合名称': '省份城市分析',
409
+ },
410
+ {
411
+ '文件简称': '店铺销售指标',
412
+ '数据库名': '淘宝_生意经3',
413
+ '集合名称': '店铺销售指标',
414
+ },
415
+ ]
416
+
417
+ for root, dirs, files in os.walk(path, topdown=False):
418
+ for name in files:
419
+ if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
420
+ continue
421
+ if 'py_xg' in name:
422
+ continue
423
+ is_continue = False
424
+ if is_except:
425
+ for item in is_except:
426
+ if item in os.path.join(root, name):
427
+ # print(name)
428
+ is_continue = True
429
+ break
430
+ if is_continue: # skip excluded files or folders
431
+ continue
432
+
433
+ # filter out non-target reports here
434
+ is_continue = False
435
+ db_name = None
436
+ collection_name = None
437
+ for item in report_names:
438
+ if item['文件简称'] in name:
439
+ db_name = item['数据库名']
440
+ collection_name = item['集合名称']
441
+ is_continue = True
442
+ if not is_continue:
443
+ continue
444
+
445
+ if name.endswith('.csv') and 'baobei' in name:
446
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
447
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
448
+ pattern = re.findall(r'-(\d{4})(\d{2})(\d{2}).csv', name)[0]
449
+ df['日期'] = '-'.join(pattern)
450
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
451
+ new_name = f"py_xg_淘宝_baobeitrains_{'-'.join(pattern)}.csv"
452
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
453
+ os.remove(os.path.join(root, name))
454
+ elif name.endswith('.csv') and 'order' in name:
455
+ """ 这里不能使用表格原先的 gb2312, 会报错 """
456
+ # encoding = self.get_encoding(file_path=os.path.join(root, name))
457
+ df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
458
+ pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)[0]
459
+ date1 = '-'.join(pattern[1:4])
460
+ date2 = '-'.join(pattern[4:7])
461
+ df.insert(loc=0, column='日期', value=date1)
462
+ df.insert(loc=1, column='数据周期', value=f'{date1}_{date2}')
463
+ df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
464
+ df['颜色编码'] = df['商家编码'].apply(
465
+ lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
466
+ new_name = f'py_xg_淘宝_order_{date1}_{date2}.csv'
467
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
468
+ os.remove(os.path.join(root, name))
469
+ elif name.endswith('.csv') and '省份城市分析' in name:
470
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
471
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
472
+ pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)[0]
473
+ date = '-'.join(pattern[1:])
474
+ new_name = f'py_xg_淘宝_{pattern[0]}-{date}.csv'
476
+ if len(df) == 0:
477
+ print(f'{name} 报表数据为空')
478
+ os.remove(os.path.join(root, name))
479
+ continue
480
+ df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
481
+ df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
482
+ df['省'] = df['省'].ffill() # fillna(method=...) is deprecated in newer pandas
483
+ df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
484
+ pov = df.pop('省')
485
+ city = df.pop('城市')
486
+ df['省+市'] = df['省份']
487
+ df['省份'] = pov
488
+ df.insert(loc=1, column='城市', value=city)
489
+ df.insert(loc=0, column='日期', value=date)
490
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
491
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
492
+ os.remove(os.path.join(root, name))
493
+ elif name.endswith('.csv') and '店铺销售指标' in name:
494
+ # Shengyijing shop metrics; meant for monthly data, though daily data works too
495
+ name_st = re.findall(r'(.*)\(分日', name)
496
+ if not name_st:
497
+ print(f'{name} 已转换的表格')
498
+ continue
499
+ encoding = self.get_encoding(file_path=os.path.join(root, name))
500
+ df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
501
+ if len(df) == 0:
502
+ print(f'{name} 报表数据为空')
503
+ os.remove(os.path.join(root, name))
504
+ continue
505
+ df['日期'] = df['日期'].astype(str).apply(
506
+ lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
507
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # convert the date column
508
+ # min_clm = str(df.min()['日期']).split(' ')[0]
509
+ # max_clm = str(df.max()['日期']).split(' ')[0]
510
+ min_clm = str(df['日期'].min()).split(' ')[0]
511
+ max_clm = str(df['日期'].max()).split(' ')[0]
512
+ new_name = f'py_xg_淘宝_{name_st[0]}-{min_clm}_{max_clm}.csv' # drop the "(分日)" suffix when saving
513
+ df.replace(to_replace=['--'], value='', regex=False, inplace=True)
514
+ df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
515
+ self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
516
+ os.remove(os.path.join(root, name))
517
+
518
+ # queue the data in self.datas for the database upload
519
+ if not db_name or not collection_name:
520
+ print(f'db_name/collection_name 不能为空')
521
+ continue
522
+ self.datas.append(
523
+ {
524
+ '数据库名': db_name,
525
+ '集合名称': collection_name,
526
+ '数据主体': df,
527
+ '文件名': name,
528
+ }
529
+ )
530
+
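Both Shengyijing methods flatten the tree-shaped 省份 column of the 省份城市分析 report, where city rows carry ' ├─ '/' └─ ' prefixes, into separate province and city columns. A toy reproduction of that transform (place names are illustrative):

    import pandas as pd

    df = pd.DataFrame({'省份': ['广东省', ' ├─ 广州市', ' └─ 深圳市']})
    df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
    df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
    df['省'] = df['省'].ffill()  # province rows fill down onto their city rows
    df['城市'] = df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True)
    print(df[['省', '城市']].values.tolist())
    # [['广东省', '汇总'], ['广东省', '广州市'], ['广东省', '深圳市']]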
531
+ def jd_reports(self, path=None, is_except=[]):
532
+ """ 处理京东报表 """
533
+ if not path:
534
+ path = self.path
535
+ report_names = [
536
+ {
537
+ '文件简称': '京东推广_点击成交',
538
+ '数据库名': '京东数据3',
539
+ '集合名称': '推广数据_京准通',
540
+ },
541
+ {
542
+ '文件简称': '京东推广_搜索词',
543
+ '数据库名': '京东数据3',
544
+ '集合名称': '推广数据_搜索词报表',
545
+ },
546
+ {
547
+ '文件简称': '京东推广_关键词',
548
+ '数据库名': '京东数据3',
549
+ '集合名称': '推广数据_关键词报表',
550
+ },
551
+ {
552
+ '文件简称': '京东商智_sku_商品明细',
553
+ '数据库名': '京东数据3',
554
+ '集合名称': '京东商智_sku_商品明细',
555
+ },
556
+ {
557
+ '文件简称': '京东商智_spu_商品明细',
558
+ '数据库名': '京东数据3',
559
+ '集合名称': '京东商智_spu_商品明细',
560
+ },
561
+ {
562
+ '文件简称': '京东商智_店铺来源_三级来源',
563
+ '数据库名': '京东数据3',
564
+ '集合名称': '京东商智_店铺来源',
565
+ },
566
+ ]
567
+
568
+ for root, dirs, files in os.walk(path, topdown=False):
569
+ for name in files:
570
+ if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
571
+ continue
572
+ if 'py_xg' in name:
573
+ continue
574
+ is_continue = False
575
+ if is_except:
576
+ for item in is_except:
577
+ if item in os.path.join(root, name):
578
+ # print(name)
579
+ is_continue = True
580
+ break
581
+ if is_continue: # skip excluded files or folders
582
+ continue
583
+
584
+ # filter out non-target reports here
585
+ is_continue = False
586
+ db_name = None
587
+ collection_name = None
588
+ for item in report_names:
589
+ if item['文件简称'] in name:
590
+ db_name = item['数据库名']
591
+ collection_name = item['集合名称']
592
+ is_continue = True
593
+ if not is_continue:
594
+ continue
595
+
596
+ if name.endswith('.xlsx') and '京东推广_' in name:
597
+ df = pd.read_excel(os.path.join(root, name), header=0)
598
+ new_name = f'py_xg_{name}'
599
+ os.rename(os.path.join(root, name), os.path.join(root, new_name))
600
+ elif name.endswith('.xlsx') and '京东商智_sku_商品明细' in name:
601
+ df = pd.read_excel(os.path.join(root, name), header=0)
602
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
603
+ pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
604
+ df.insert(loc=0, column='日期', value=pattern)
605
+ df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
606
+ df.fillna(0, inplace=True)
607
+ new_name = f'py_xg_{name}'
608
+ df.to_excel(os.path.join(upload_path, new_name),
609
+ index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
610
+ os.remove(os.path.join(root, name))
611
+ elif name.endswith('.xlsx') and '京东商智_spu_商品明细' in name:
612
+ df = pd.read_excel(os.path.join(root, name), header=0)
613
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
614
+ pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
615
+ df.insert(loc=0, column='日期', value=pattern)
616
+ df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
617
+ df.fillna(0, inplace=True)
618
+ new_name = f'py_xg_{name}'
619
+ df.to_excel(os.path.join(upload_path, new_name),
620
+ index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
621
+ os.remove(os.path.join(root, name))
622
+ elif name.endswith('.xlsx') and '京东商智_店铺来源_三级来源' in name:
623
+ df = pd.read_excel(os.path.join(root, name), header=0)
624
+ df.replace(to_replace=['-'], value='', regex=False, inplace=True)
625
+ df.rename(columns={'时间': '日期'}, inplace=True)
626
+ for col in df.columns.tolist():
627
+ if '环比' in col or '同比' in col:
628
+ df.drop(col, axis=1, inplace=True)
629
+ df.fillna(0, inplace=True)
630
+ new_name = f'py_xg_{name}'
631
+ df.to_excel(os.path.join(upload_path, new_name),
632
+ index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
633
+ os.remove(os.path.join(root, name))
634
+
635
+ # queue the data in self.datas for the database upload
636
+ if not db_name or not collection_name:
637
+ print(f'db_name/collection_name 不能为空')
638
+ continue
639
+ # print(name)
640
+ self.datas.append(
641
+ {
642
+ '数据库名': db_name,
643
+ '集合名称': collection_name,
644
+ '数据主体': df,
645
+ '文件名': name,
646
+ }
647
+ )
648
+
649
+ def sp_scene_clean(self, path=None, is_except=[]):
650
+ if not path:
651
+ path = self.path
652
+ for root, dirs, files in os.walk(path, topdown=False):
653
+ for name in files:
654
+ if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
655
+ continue
656
+ if 'py_xg' in name:
657
+ continue
658
+ is_continue = False
659
+ if is_except:
660
+ for item in is_except:
661
+ if item in os.path.join(root, name):
662
+ # print(name)
663
+ is_continue = True
664
+ break
665
+ if is_continue: # skip excluded files or folders
666
+ continue
667
+
668
+ db_name = None # initialize; files matching no branch below would otherwise raise NameError
+ collection_name = None
+ if name.endswith('.xlsx') and '商品素材_' in name:
669
+ shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)_', name)[0]
670
+ df = pd.read_excel(os.path.join(root, name), header=0)
671
+ df.insert(loc=1, column='店铺名称', value=shop_name)
672
+ new_name = f'py_xg_{name}'
673
+ df.to_excel(os.path.join(upload_path, new_name),
674
+ index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
675
+ if '官方旗舰店' in name:
676
+ db_name = '属性设置3'
677
+ collection_name = '商品素材_天猫'
678
+ elif '官方企业店' in name:
679
+ db_name = '属性设置3'
680
+ collection_name = '商品素材_淘宝'
681
+ os.remove(os.path.join(root, name))
682
+
683
+ # queue the data in self.datas for the database upload
684
+ if not db_name or not collection_name:
685
+ print(f'db_name/collection_name 不能为空')
686
+ continue
687
+ self.datas.append(
688
+ {
689
+ '数据库名': db_name,
690
+ '集合名称': collection_name,
691
+ '数据主体': df,
692
+ '文件名': name,
693
+ }
694
+ )
695
+ """
696
+ {File classification}
697
+ Move processed files into categorized folders under the archive directory.
698
+ The t_path argument here defines the name of the generated subfolder.
699
+ """
700
+
701
+ @staticmethod
702
+ def move_files(path, _name, target_path, _as_month=None):
703
+ """
704
+ name: file name to move,
705
+ target_path: destination directory
706
+ """
707
+ t2 = target_path # the t2 alias is used below; do not remove
708
+ if not os.path.exists(t2): # create the directory if it does not exist
709
+ os.makedirs(t2, exist_ok=True)
710
+ if _as_month:
711
+ _date = re.findall(r'(\d{4}-\d{2})-\d{2}', str(_name))
712
+ if _date:
713
+ _date = _date[0]
714
+ t2 = pathlib.Path(t2, _date) # add a year-month bucket
715
+ if not os.path.exists(t2):
716
+ os.makedirs(t2, exist_ok=True)
717
+ old_file = os.path.join(t2, _name) # check whether the file already exists at the destination
718
+ if os.path.isfile(old_file):
719
+ os.remove(old_file) # remove it if present
720
+ shutil.move(os.path.join(path, _name), t2) # move the file from the download folder to the destination
721
+
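The _as_month branch buckets moved files into YYYY-MM subfolders keyed on the first full date found in the file name. A sketch of just that naming rule (the helper and the paths are hypothetical):

    import os
    import re

    def month_bucket(target_path, file_name):  # hypothetical helper
        # extract 'YYYY-MM' from the first 'YYYY-MM-DD' in the name and
        # append it to the target directory, as move_files does
        m = re.findall(r'(\d{4}-\d{2})-\d{2}', file_name)
        return os.path.join(target_path, m[0]) if m else target_path

    print(month_bucket('原始文件3/天猫推广报表/主体报表', 'py_xg_主体报表_2024-03-15.csv'))
    # 原始文件3/天猫推广报表/主体报表/2024-03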
722
+ # @try_except
723
+ def move_sjy(self, path=None, is_except=[]):
724
+ if not path:
725
+ path = self.path
726
+ for root, dirs, files in os.walk(path, topdown=False):
727
+ for name in files:
728
+ # print(name)
729
+ is_continue = False
730
+ if is_except:
731
+ for item in is_except:
732
+ # print(item, f'-----', os.path.join(root, name))
733
+ if item in os.path.join(root, name):
734
+ # print(name)
735
+ is_continue = True
736
+ break
737
+ if is_continue: # skip excluded files or folders
738
+ continue
739
+
740
+ # print(is_except, is_continue)
741
+ def bib(paths, _as_month=None):
742
+ """闭包函数"""
743
+ self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
744
+
745
+ if 'py_xg' not in name: # skip non-target files
746
+ continue
747
+
748
+ if '天猫' in name and name.endswith('.csv') and 'baobei' in name:
749
+ t_path = os.path.join(self.source_path, '天猫_生意经', '宝贝指标')
750
+ bib(t_path, _as_month=True)
751
+ elif '天猫' in name and name.endswith('.csv') and '省份城市分析' in name:
752
+ t_path = os.path.join(self.source_path, '天猫_生意经', '省份城市分析')
753
+ bib(t_path, _as_month=True)
754
+ elif '天猫' in name and name.endswith('.csv') and '店铺销售指标' in name:
755
+ t_path = os.path.join(self.source_path, '天猫_生意经', '店铺销售指标')
756
+ bib(t_path, _as_month=False)
757
+ elif '天猫' in name and name.endswith('.csv') and 'order' in name:
758
+ t_path = os.path.join(self.source_path, '天猫_生意经', '订单数据')
759
+ bib(t_path, _as_month=False)
760
+ elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and 'baobei' in name: # parentheses required; a bare 'or' would match any file containing '淘宝'
761
+ t_path = os.path.join(self.source_path, '淘宝_生意经', '宝贝指标')
762
+ bib(t_path, _as_month=True)
763
+ elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and '省份城市分析' in name:
764
+ t_path = os.path.join(self.source_path, '淘宝_生意经', '省份城市分析')
765
+ bib(t_path, _as_month=True)
766
+ elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and '店铺销售指标' in name:
767
+ t_path = os.path.join(self.source_path, '淘宝_生意经', '店铺销售指标')
768
+ bib(t_path, _as_month=False)
769
+ elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and 'order' in name:
770
+ t_path = os.path.join(self.source_path, '淘宝_生意经', '订单数据')
771
+ bib(t_path, _as_month=False)
772
+
773
+ # @try_except
774
+ def move_jd(self, path=None, is_except=[]):
775
+ if not path:
776
+ path = self.path
777
+ for root, dirs, files in os.walk(path, topdown=False):
778
+ for name in files:
779
+ # print(name)
780
+ is_continue = False
781
+ if is_except:
782
+ for item in is_except:
783
+ # print(item, f'-----', os.path.join(root, name))
784
+ if item in os.path.join(root, name):
785
+ # print(name)
786
+ is_continue = True
787
+ break
788
+ if is_continue: # skip excluded files or folders
789
+ continue
790
+
791
+ # print(is_except, is_continue)
792
+ def bib(paths, _as_month=None):
793
+ """闭包函数"""
794
+ self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
795
+
796
+ if 'py_xg' not in name: # skip non-target files
797
+ continue
798
+
799
+ if name.endswith('.xlsx') and '京东商智_spu_商品明细' in name:
800
+ t_path = os.path.join(self.source_path, '京东报表', 'spu_商品明细')
801
+ bib(t_path, _as_month=True)
802
+ elif name.endswith('.xlsx') and '京东商智_sku_商品明细' in name:
803
+ t_path = os.path.join(self.source_path, '京东报表', 'sku_商品明细')
804
+ bib(t_path, _as_month=True)
805
+ elif name.endswith('.xlsx') and '京东推广_搜索词' in name:
806
+ t_path = os.path.join(self.source_path, '京东报表', '搜索词报表')
807
+ bib(t_path, _as_month=True)
808
+ elif name.endswith('.xlsx') and '京东推广_点击成交' in name:
809
+ t_path = os.path.join(self.source_path, '京东报表', '推广报表')
810
+ bib(t_path, _as_month=True)
811
+ elif name.endswith('.xlsx') and '京东推广_关键词点击' in name:
812
+ t_path = os.path.join(self.source_path, '京东报表', '关键词报表')
813
+ bib(t_path, _as_month=True)
814
+ elif name.endswith('.xlsx') and '京东商智_店铺来源_三级来源' in name:
815
+ t_path = os.path.join(self.source_path, '京东报表', '店铺来源_三级来源')
816
+ bib(t_path, _as_month=True)
817
+
818
+ # @try_except
819
+ def move_tg_tm(self, path=None, is_except=[]):
820
+ if not path:
821
+ path = self.path
822
+ for root, dirs, files in os.walk(path, topdown=False):
823
+ for name in files:
824
+ # print(name)
825
+ is_continue = False
826
+ if is_except:
827
+ for item in is_except:
828
+ # print(item, f'-----', os.path.join(root, name))
829
+ if item in os.path.join(root, name):
830
+ # print(name)
831
+ is_continue = True
832
+ break
833
+ if is_continue: # skip excluded files or folders
834
+ continue
835
+ # print(is_except, is_continue)
836
+ def bib(paths, _as_month=None):
837
+ """闭包函数"""
838
+ self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
839
+
840
+ if 'py_xg' not in name: # skip non-target files
841
+ continue
842
+
843
+ if name.endswith('.csv') and 'tg_report_主体报表_万里马官方旗舰店' in name:
844
+ t_path = os.path.join(self.source_path, '天猫推广报表', '主体报表')
845
+ bib(t_path, _as_month=True)
846
+ elif name.endswith('.csv') and 'tg_report_营销场景报表_万里马官方旗舰店' in name:
847
+ t_path = os.path.join(self.source_path, '天猫推广报表', '营销场景报表')
848
+ bib(t_path, _as_month=True)
849
+ elif name.endswith('.csv') and 'tg_report_人群报表_万里马官方旗舰店' in name:
850
+ t_path = os.path.join(self.source_path, '天猫推广报表', '人群报表')
851
+ bib(t_path, _as_month=True)
852
+ elif name.endswith('.csv') and 'tg_report_权益报表_万里马官方旗舰店' in name:
853
+ t_path = os.path.join(self.source_path, '天猫推广报表', '权益报表')
854
+ bib(t_path, _as_month=True)
855
+ elif name.endswith('.csv') and 'tg_report_计划报表_万里马官方旗舰店' in name:
856
+ t_path = os.path.join(self.source_path, '天猫推广报表', '计划报表')
857
+ bib(t_path, _as_month=True)
858
+ elif name.endswith('.csv') and 'tg_report_关键词报表_万里马官方旗舰店' in name:
859
+ t_path = os.path.join(self.source_path, '天猫推广报表', '关键词报表')
860
+ bib(t_path, _as_month=True)
861
+ elif name.endswith('.csv') and 'tg_report_地域报表_省份_万里马官方旗舰店' in name:
862
+ t_path = os.path.join(self.source_path, '天猫推广报表', '地域报表_省份')
863
+ bib(t_path, _as_month=True)
864
+ elif name.endswith('.csv') and 'tg_report_地域报表_城市_万里马官方旗舰店' in name:
865
+ t_path = os.path.join(self.source_path, '天猫推广报表', '地域报表_城市')
866
+ bib(t_path, _as_month=True)
867
+ elif name.endswith('.csv') and 'tg_report_单元报表_万里马官方旗舰店' in name:
868
+ t_path = os.path.join(self.source_path, '天猫推广报表', '单元报表')
869
+ bib(t_path, _as_month=True)
870
+ elif name.endswith('.csv') and 'tg_report_创意报表_素材粒度_万里马官方旗舰店' in name:
871
+ t_path = os.path.join(self.source_path, '天猫推广报表', '创意报表_素材粒度')
872
+ bib(t_path, _as_month=True)
873
+ elif name.endswith('.csv') and 'tg_report_创意报表_创意粒度_万里马官方旗舰店' in name:
874
+ t_path = os.path.join(self.source_path, '天猫推广报表', '创意报表_创意粒度')
875
+ bib(t_path, _as_month=True)
876
+ elif name.endswith('.csv') and 'tg_report_超级直播报表_人群_万里马官方旗舰店' in name:
877
+ t_path = os.path.join(self.source_path, '天猫推广报表', '超级直播报表_人群')
878
+ bib(t_path, _as_month=True)
879
+
880
+ elif name.endswith('.csv') and 'tg_report_品销宝_明星店铺_万里马官方旗舰店' in name:
881
+ if '账户' in name:
882
+ t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '账户报表')
883
+ bib(t_path, _as_month=True)
884
+ elif '推广计划' in name:
885
+ t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '推广计划报表')
886
+ bib(t_path, _as_month=True)
887
+ elif '推广单元' in name:
888
+ t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '推广单元报表')
889
+ bib(t_path, _as_month=True)
890
+ elif '创意' in name:
891
+ t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '创意报表')
892
+ bib(t_path, _as_month=True)
893
+ elif '品牌流量包' in name:
894
+ t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '品牌流量包报表')
895
+ bib(t_path, _as_month=True)
896
+ elif '定向人群' in name:
897
+ t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '定向人群报表')
898
+ bib(t_path, _as_month=True)
899
+ elif name.endswith('xlsx') and '商品素材_万里马官方旗舰店' in name:
900
+ t_path = os.path.join(self.source_path, '商品素材', '天猫')
901
+ bib(t_path, _as_month=True)
902
+ elif name.endswith('xlsx') and '商品素材_万里马官方企业店' in name:
903
+ t_path = os.path.join(self.source_path, '商品素材', '淘宝')
904
+ bib(t_path, _as_month=True)
905
+
906
+ # @try_except
907
+ def move_tg_tb(self, path=None, is_except=[]):
908
+ if not path:
909
+ path = self.path
910
+ for root, dirs, files in os.walk(path, topdown=False):
911
+ for name in files:
912
+ # print(name)
913
+ is_continue = False
914
+ if is_except:
915
+ for item in is_except:
916
+ # print(item, f'-----', os.path.join(root, name))
917
+ if item in os.path.join(root, name):
918
+ # print(name)
919
+ is_continue = True
920
+ break
921
+ if is_continue: # skip excluded files or folders
922
+ continue
923
+
924
+ # print(is_except, is_continue)
925
+ def bib(paths, _as_month=None):
926
+ """闭包函数"""
927
+ self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
928
+
929
+ if 'py_xg' not in name: # skip non-target files
930
+ continue
931
+
932
+ if name.endswith('.csv') and 'tg_report_主体报表_万里马官方企业店' in name:
933
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '主体报表')
934
+ bib(t_path, _as_month=True)
935
+ elif name.endswith('.csv') and 'tg_report_营销场景报表_万里马官方企业店' in name:
936
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '营销场景报表')
937
+ bib(t_path, _as_month=True)
938
+ elif name.endswith('.csv') and 'tg_report_人群报表_万里马官方企业店' in name:
939
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '人群报表')
940
+ bib(t_path, _as_month=True)
941
+ elif name.endswith('.csv') and 'tg_report_权益报表_万里马官方企业店' in name:
942
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '权益报表')
943
+ bib(t_path, _as_month=True)
944
+ elif name.endswith('.csv') and 'tg_report_计划报表_万里马官方企业店' in name:
945
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '计划报表')
946
+ bib(t_path, _as_month=True)
947
+ elif name.endswith('.csv') and 'tg_report_关键词报表_万里马官方企业店' in name:
948
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '关键词报表')
949
+ bib(t_path, _as_month=True)
950
+ elif name.endswith('.csv') and 'tg_report_地域报表_省份_万里马官方企业店' in name:
951
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '地域报表_省份')
952
+ bib(t_path, _as_month=True)
953
+ elif name.endswith('.csv') and 'tg_report_地域报表_城市_万里马官方企业店' in name:
954
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '地域报表_城市')
955
+ bib(t_path, _as_month=True)
956
+ elif name.endswith('.csv') and 'tg_report_单元报表_万里马官方企业店' in name:
957
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '单元报表')
958
+ bib(t_path, _as_month=True)
959
+ elif name.endswith('.csv') and 'tg_report_创意报表_素材粒度_万里马官方企业店' in name:
960
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '创意报表_素材粒度')
961
+ bib(t_path, _as_month=True)
962
+ elif name.endswith('.csv') and 'tg_report_创意报表_创意粒度_万里马官方企业店' in name:
963
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '创意报表_创意粒度')
964
+ bib(t_path, _as_month=True)
965
+ elif name.endswith('.csv') and 'tg_report_超级直播报表_万里马官方企业店' in name:
966
+ t_path = os.path.join(self.source_path, '淘宝推广报表', '超级直播报表')
967
+ bib(t_path, _as_month=True)
968
+
969
+ # @try_except
970
+ def new_unzip(self, path=None, is_move=None):
971
+ """
972
+ {Unzip and remove zip files}
974
+ For JD product-detail files, the procedure is:
975
+ 1. Read the file names inside the zip.
976
+ 2. Build the full path and check whether a same-named file already exists in the folder.
977
+ 3. If it does, rename that file (extract the date from the file name and rebuild the name).
978
+ 4. Then extract the zip.
979
+ 5. _jd_rename must still be used to rename the freshly extracted file.
980
+ is_move: whether to remove all zip files from the download directory
980
+ """
981
+ if not path:
982
+ path = self.path
983
+ res_names = [] # zip files to remove
984
+ for root, dirs, files in os.walk(path, topdown=False):
985
+ for name in files:
986
+ if '~$' in name or 'DS_Store' in name:
987
+ continue
988
+ if name.endswith('.zip'):
989
+ old_file = os.path.join(root, name)
990
+ f = zipfile.ZipFile(old_file, 'r')
991
+ if len(f.namelist()) == 1: # the archive contains a single file
992
+ for zip_name in f.namelist(): # read the file names inside the zip
993
+ try:
996
+ zip_name_1 = zip_name.encode('cp437').decode('utf-8') # legacy zip entries are cp437-decoded; recover the UTF-8 name
997
+ except:
998
+ zip_name_1 = zip_name # name is already valid UTF-8 (the zip's UTF-8 flag bit was set)
998
+ new_path = os.path.join(root, zip_name_1) # path the entry will have after extraction
999
+ if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path: # a file with the same name as the zip entry already exists?
1000
+ # special handling for JD files; obsolete, can be removed
1001
+ df = pd.read_excel(new_path)
1002
+ try:
1003
+ pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
1004
+ name)
1005
+ pattern2 = re.findall(
1006
+ r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
1007
+ name)
1008
+ if pattern1:
1009
+ year_date = '-'.join(list(pattern1[0])) + '_' + '-'.join(list(pattern1[0]))
1010
+ elif pattern2:
1011
+ year_date = '-'.join(list(pattern2[0])[0:3]) + '_' + '-'.join(
1012
+ list(pattern2[0])[3:7])
1013
+ else:
1014
+ year_date = '无法提取日期'
1015
+ print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
1016
+ if ('10035975359247' in df['商品ID'].values or '10056642622343' in
1017
+ df['商品ID'].values):
1018
+ os.rename(new_path,
1019
+ os.path.join(root, 'sku_' + year_date + '_全部渠道_商品明细.xls'))
1020
+ f.extract(zip_name_1, root)
1021
+ elif ('10021440233518' in df['商品ID'].values or '10022867813485' in
1022
+ df['商品ID'].values):
1023
+ os.rename(new_path,
1024
+ os.path.join(root, 'spu_' + year_date + '_全部渠道_商品明细.xls'))
1025
+ f.extract(zip_name_1, root)
1026
+ if is_move:
1027
+ os.remove(os.path.join(root, name))
1028
+ except Exception as e:
1029
+ print(e)
1030
+ continue
1031
+ else:
1032
+ f.extract(zip_name, root)
1033
+ if zip_name_1 != zip_name:
1034
+ os.rename(os.path.join(root, zip_name), os.path.join(root, zip_name_1))
1035
+ if is_move:
1036
+ res_names.append(name)
1037
+ # os.remove(os.path.join(root, name)) # cannot remove here: the file is reported as still in use
1038
+ f.close()
1039
+ else: # the archive contains multiple files
1040
+ f.close()
1041
+ self.unzip_all(path=old_file, save_path=path)
1042
+
1043
+ if is_move:
1044
+ for name in res_names:
1045
+ os.remove(os.path.join(path, name))
1046
+ print(f'移除{os.path.join(path, name)}')
1047
+
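Background on the cp437 round-trip used above: the ZIP format decodes entry names as cp437 unless the entry's UTF-8 flag bit is set, so Python's zipfile yields mojibake for Chinese names stored the legacy way. A standalone sketch of the repair (the helper is hypothetical; it mirrors the fixed try/except order in new_unzip):

    def fix_zip_name(raw):  # hypothetical helper
        # legacy entries come out cp437-decoded; round-trip them back to UTF-8.
        # If the name is already proper UTF-8 (flag bit set), encode('cp437')
        # raises and the name is returned unchanged.
        try:
            return raw.encode('cp437').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return raw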
1048
+ @staticmethod
1049
+ def unzip_all(path, save_path):
1050
+ """
1051
+ 1. Walk the directory and rename files whose names are mojibake.
1052
+ 2. If the archive is a folder, save it to a new folder and delete the garbled one.
1053
+ 3. Delete the macOS temp folder __MACOSX.
1054
+ """
1055
+ with PyZipFile(path) as _f:
1056
+ _f.extractall(save_path)
1058
+ for _root, _dirs, _files in os.walk(save_path, topdown=False):
1059
+ for _name in _files:
1060
+ if '~$' in _name or 'DS_Store' in _name:
1061
+ continue
1062
+ try:
1063
+ _new_root = _root.encode('cp437').decode('utf-8')
1064
+ _new_name = _name.encode('cp437').decode('utf-8')
1065
+ except:
1066
+ _new_root = _root # already valid UTF-8; keep as-is
1067
+ _new_name = _name
1068
+ _old = os.path.join(_root, _name)
1069
+ _new = os.path.join(_new_root, _new_name)
1070
+ if _new_root != _root: # garbled directory name: create the fixed one
1071
+ os.makedirs(_new_root, exist_ok=True)
1072
+ os.rename(_old, _new)
1073
+ try:
1074
+ _new_root = _root.encode('cp437').decode('utf-8')
1075
+ except:
1076
+ _new_root = _root # already valid UTF-8
1077
+ if _new_root != _root or '__MACOSX' in _root:
1078
+ shutil.rmtree(_root)
1079
+
1080
+ def upload_df(self, service_databases=None, path=None):
1081
+ """
1082
+ Upload the cleaned df to the databases; called by copysh.py
1083
+ """
1084
+ if not service_databases:
1085
+ service_databases = self.service_databases
1086
+ df_to_json = df_types.DataTypes() # json file holding the dtypes info for the data
1087
+ for service_database in service_databases:
1088
+ for service_name, database in service_database.items():
1089
+ # print(service_name, database)
1090
+ if database == 'mongodb':
1091
+ username, password, host, port = get_myconf.select_config_values(
1092
+ target_service=service_name,
1093
+ database=database,
1094
+ )
1095
+ d = mongo.UploadMongo(
1096
+ username=username,
1097
+ password=password,
1098
+ host=host,
1099
+ port=port,
1100
+ drop_duplicates=False,
1101
+ )
1102
+ for data in self.datas:
1103
+ db_name, collection_name, df = data['数据库名'], data['集合名称'], data['数据主体']
1104
+ df_to_json.get_df_types(
1105
+ df=df,
1106
+ db_name=db_name,
1107
+ collection_name=collection_name,
1108
+ is_file_dtype=True, # local-file dtypes take precedence by default: True
1109
+ )
1110
+ d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)
1111
+ if d.client:
1112
+ d.client.close()
1113
+
1114
+ elif database == 'mysql':
1115
+ username, password, host, port = get_myconf.select_config_values(
1116
+ target_service=service_name,
1117
+ database=database,
1118
+ )
1119
+ m = mysql.MysqlUpload(
1120
+ username=username,
1121
+ password=password,
1122
+ host=host,
1123
+ port=port,
1124
+ )
1125
+ for data in self.datas:
1126
+ df, db_name, collection_name, rt_filename = data['数据主体'], data['数据库名'], data['集合名称'], data['文件名']
1127
+ df_to_json.get_df_types(
1128
+ df=df,
1129
+ db_name=db_name,
1130
+ collection_name=collection_name,
1131
+ is_file_dtype=True, # local-file dtypes take precedence by default: True
1132
+ )
1133
+ m.df_to_mysql(
1134
+ df=df,
1135
+ db_name=db_name,
1136
+ table_name=collection_name,
1137
+ move_insert=True, # delete first, then insert
1138
+ df_sql=False, # True: upload the whole table via df.to_sql, without deduplication
1139
+ drop_duplicates=False, # True: check for duplicates before inserting (slower); False: upload directly
1140
+ filename=rt_filename, # used to track processing progress
1141
+ service_database=service_database, # dict
1142
+ )
1143
+ df_to_json.as_json_file() # write the json file holding the dtypes info
1144
+
1145
+
1146
+ def main(service_databases=None):
1147
+ # data classification
1148
+
1149
+ if not service_databases:
1150
+ service_databases = [
1151
+ # {'home_lx': 'mongodb'},
1152
+ {'home_lx': 'mysql'},
1153
+ # {'company': 'mysql'},
1154
+ # {'nas': 'mysql'},
1155
+ ]
1156
+
1157
+ c = DataClean(
1158
+ path=upload_path, # source directory, the Downloads folder
1159
+ source_path=source_path3, # archive directory for source files
1160
+ service_databases=service_databases
1161
+ )
1162
+ c.new_unzip(is_move=True) # unzip files; is_move: whether to delete the original zip after extraction
1163
+ c.tg_reports(is_except=['except']) # promotion reports, Tmall and Taobao cleaned together
1164
+ c.syj_reports_tm(is_except=['except']) # Tmall Shengyijing
1165
+ # c.syj_reports_tb(is_except=['except']) # Taobao Shengyijing; must not run at the same time as Tmall
1166
+ c.jd_reports(is_except=['except']) # clean JD reports
1167
+ c.sp_scene_clean(is_except=['except']) # product creatives
1168
+ c.upload_df(service_databases=service_databases) # upload to the databases
1169
+
1170
+ c.move_sjy(is_except=['临时文件',]) # Shengyijing: move files to the archive folder
1171
+ c.move_jd(is_except=['临时文件', ]) # JD: move files to the archive folder
1172
+ c.move_tg_tm(is_except=['临时文件', ]) # Tmall: move files to the archive folder
1173
+ c.move_tg_tb(is_except=['临时文件', ]) # Taobao shop: move files to the archive folder
1174
+
1175
+ # update the product-year baseline table: 属性设置2 - 货品年份基准
1176
+ p = products.Products()
1177
+ p.to_mysql(service_databases=service_databases)
1178
+
1179
+ # clean all databases holding non-aggregated data
1180
+ optimize_data.op_data(
1181
+ db_name_lists=[
1182
+ '京东数据2',
1183
+ '推广数据2',
1184
+ '市场数据2',
1185
+ '生意参谋2',
1186
+ '生意经2',
1187
+ '属性设置2',
1188
+ # '聚合数据', # do not clean aggregated data here; aggregation has not run yet
1189
+ '京东数据3',
1190
+ '天猫_推广数据3',
1191
+ '淘宝_推广数据3',
1192
+ # '市场数据3',
1193
+ # '生意参谋3',
1194
+ '天猫_生意经3',
1195
+ # '淘宝_生意经3',
1196
+ ],
1197
+ days=100,
1198
+ is_mongo=True,
1199
+ is_mysql=True,
1200
+ )
1201
+
1202
+ # data aggregation
1203
+ query_data.data_aggregation(service_databases=service_databases, months=3)
1204
+ time.sleep(60)
1205
+
1206
+ # clean aggregated data; MongoDB holds none, so only MySQL needs cleaning
1207
+ optimize_data.op_data(
1208
+ db_name_lists=['聚合数据'],
1209
+ days=3650,
1210
+ service_databases=service_databases,
1211
+ is_mongo=False,
1212
+ is_mysql=True,
1213
+ )
1214
+
1215
+
1216
+ if __name__ == '__main__':
1217
+ main(
1218
+ service_databases = [
1219
+ # {'company': 'mysql'},
1220
+ {'home_lx': 'mysql'},
1221
+ # {'home_lx': 'mongodb'},
1222
+ # {'nas': 'mysql'},
1223
+ ]
1224
+ )
1225
+
1226
+ # c = DataClean(
1227
+ # path=upload_path, # source directory, the Downloads folder
1228
+ # source_path=source_path3, # archive directory for source files
1229
+ # service_databases=[{'home_lx': 'mysql'},]
1230
+ # )
1231
+ # c.sp_scene_clean(is_except=['except']) # product creatives
1232
+ # c.move_tg_tm(is_except=['临时文件', ]) # Tmall: move files to the archive folder
@@ -44,8 +44,12 @@ class DataFrameConverter(object):
44
44
  df.pop(col) # 等待插入的 df 不能包含 id 列,否则可能跟现有 id 主键冲突
45
45
  continue
46
46
 
47
- # 百分比在某些数据库中不兼容, 转换百分比为小数
48
- df[col] = df[col].apply(lambda x: float(float((str(x).rstrip("%"))) / 100) if str(x).endswith('%') and '~' not in str(x) else x)
47
+ try:
48
+ # percentages are incompatible with some databases; convert them to decimals. Columns converted this way must not contain Chinese or special characters
49
+ df[col] = df[col].apply(
50
+ lambda x: float(float((str(x).rstrip("%"))) / 100) if re.findall(r'^\d+%', str(x)) else x)
51
+ except Exception as e:
52
+ print(f'留意错误信息: 位于列 -> {col} -> {e}')
49
53
 
50
54
  if col.endswith('占比') or col.endswith('率'):
51
55
  df = df.astype({col: float}, errors='raise')
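A quick check of what the new ^\d+% guard matches: bare integer percentages convert; decimal percentages are not matched by this pattern; and a range such as 1%~5% still matches its leading digits and then fails inside float(), which is exactly why the column-level try/except was added:

    import re

    for cell in ['12%', '12.5%', '1%~5%', '0.3']:
        print(cell, bool(re.findall(r'^\d+%', str(cell))))
    # 12%   True  -> converted to 0.12
    # 12.5% False -> decimal percentages are left as-is by this pattern
    # 1%~5% True  -> matches, then float('1%~5') raises; caught by the try/except
    # 0.3   False -> left unchanged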
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mdbq
3
- Version: 2.5.4
3
+ Version: 2.5.6
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -9,6 +9,7 @@ mdbq/aggregation/query_data.py,sha256=WKe42Xq1Gi-ELuIT0k2jh3X4-R7heb0ub3Mj3yuCRA
9
9
  mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
10
10
  mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
11
11
  mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
12
+ mdbq/clean/clean_upload.py,sha256=q_3kjiE0sU6uV13TW9rVuPmbO01itYhkC4gTnz_nZ5o,64455
12
13
  mdbq/clean/data_clean.py,sha256=ucfslhqXVZoH2QaXHSAWDky0GhIvH9f4GeNaHg4SrFE,104790
13
14
  mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
14
15
  mdbq/company/copysh.py,sha256=NvlXCBZBcO2GIT5nLRYYqhOyHWM1-1RE7DHvgbj6jmQ,19723
@@ -19,7 +20,7 @@ mdbq/config/products.py,sha256=hN9UMkM6j76HYMulTYdtr3mOhh9QdpvvrLH14a_mbFY,5980
19
20
  mdbq/config/set_support.py,sha256=xkZCX6y9Bq1ppBpJAofld4B2YtchA7fl0eT3dx3CrSI,777
20
21
  mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
21
22
  mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
22
- mdbq/dataframe/converter.py,sha256=OY5sMFzdF7wkfE59Es-urlZ2oJJY5nkao7009GSyVv4,4110
23
+ mdbq/dataframe/converter.py,sha256=u7rQbIsgVZWOIybJaknduf7wViBdBkyU8mwUo24mDt0,4304
23
24
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
24
25
  mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
25
26
  mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
@@ -41,7 +42,7 @@ mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
41
42
  mdbq/req_post/req_tb.py,sha256=PexWSCPJNM6Tv0ol4lAWIhlOwsAr_frnjtcdSHCFiek,36179
42
43
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
43
44
  mdbq/spider/aikucun.py,sha256=KdihSB3q44jsXUQAldfWRVfCSrEw2MNbM-_BhP_29g4,14448
44
- mdbq-2.5.4.dist-info/METADATA,sha256=QPhMknQuHoAlaRRB4M7H0h-GINpIWWzG7HOujYCQwHw,243
45
- mdbq-2.5.4.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
46
- mdbq-2.5.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
47
- mdbq-2.5.4.dist-info/RECORD,,
45
+ mdbq-2.5.6.dist-info/METADATA,sha256=IIsrPFdnbkCfH4ziUSl_U05g3Cvc9vwyySZjLXN6SVU,243
46
+ mdbq-2.5.6.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
47
+ mdbq-2.5.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
48
+ mdbq-2.5.6.dist-info/RECORD,,
File without changes