mdbq 2.5.4__py3-none-any.whl → 2.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/clean/clean_upload.py +1232 -0
- mdbq/dataframe/converter.py +6 -2
- {mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/METADATA +1 -1
- {mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/RECORD +6 -5
- {mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/WHEEL +0 -0
- {mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/top_level.txt +0 -0
mdbq/clean/clean_upload.py
ADDED
@@ -0,0 +1,1232 @@
# -*- coding:utf-8 -*-
import warnings
import pandas as pd
from functools import wraps
import chardet
import zipfile
from pyzipper import PyZipFile
import os
import platform
import pathlib
import json
from mdbq.mongo import mongo
from mdbq.mysql import mysql
from mdbq.config import get_myconf
from mdbq.aggregation import df_types
from mdbq.config import products
from mdbq.aggregation import optimize_data
from mdbq.aggregation import query_data
import datetime
import time
import re
import shutil
import getpass

warnings.filterwarnings('ignore')


if platform.system() == 'Windows':
    # windows版本
    Data_Path = r'C:\同步空间\BaiduSyncdisk'
    D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
    Share_Path = os.path.join(r'\\192.168.1.198\时尚事业部\01.运营部\天猫报表')  # 共享文件根目录
elif platform.system() == 'Linux':
    Data_Path = '数据中心'
    D_PATH = 'Downloads'
    if not os.path.exists(D_PATH):
        os.makedirs(D_PATH)
    Share_Path = ''  # linux 通常是远程服务器,不需要访问共享
else:
    Data_Path = f'/Users/{getpass.getuser()}/数据中心'  # 使用Mac独立网络时
    D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')
    Share_Path = os.path.join('/Volumes/时尚事业部/01.运营部/天猫报表')  # 共享文件根目录

upload_path = os.path.join(D_PATH, '数据上传中心')  # 此目录位于下载文件夹
source_path = os.path.join(Data_Path, '原始文件2')  # 此目录保存下载并清洗过的文件,作为数据库备份
source_path3 = os.path.join(Data_Path, '原始文件3')  # 此目录保存下载并清洗过的文件,作为数据库备份


class DataClean:
    """ 数据分类 """

    def __init__(self, path, source_path, service_databases):
        self.path = path  # 数据源位置,下载文件夹
        self.source_path = source_path  # 原始文件保存目录
        self.datas = []
        self.service_databases = service_databases

    @staticmethod
    def try_except(func):  # 在类内部定义一个异常处理方法
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')  # 将异常信息返回

        return wrapper

    @staticmethod
    def get_encoding(file_path):
        """
        获取文件的编码方式, 读取速度比较慢,非必要不要使用
        """
        with open(file_path, 'rb') as f:
            f1 = f.read()
            encod = chardet.detect(f1).get('encoding')
        return encod

    @staticmethod
    def save_to_csv(_df, _save_paths, filenames, encoding='utf-8_sig'):
        if '.csv' not in filenames:
            filenames = f'{filenames}.csv'
        if not os.path.exists(_save_paths):
            os.makedirs(_save_paths, exist_ok=True)
        _df.to_csv(os.path.join(_save_paths, filenames), encoding=encoding, index=False, header=True)

    def tg_reports(self, path=None, is_except=[]):
        """ 处理天猫淘宝推广类报表 """
        if not path:
            path = self.path
        report_names = [
            {
                '文件简称': 'tg_report_主体报表',
                '数据库名': '推广数据3',
                '集合名称': '主体报表',
            },
            {
                '文件简称': 'tg_report_创意报表_创意',
                '数据库名': '推广数据3',
                '集合名称': '创意报表_创意',
            },
            {
                '文件简称': 'tg_report_创意报表_素材',
                '数据库名': '推广数据3',
                '集合名称': '创意报表_素材',
            },
            {
                '文件简称': 'tg_report_单元报表',
                '数据库名': '推广数据3',
                '集合名称': '单元报表',
            },
            {
                '文件简称': 'tg_report_地域报表_省份',
                '数据库名': '推广数据3',
                '集合名称': '地域报表_省份',
            },
            {
                '文件简称': 'tg_report_地域报表_城市',
                '数据库名': '推广数据3',
                '集合名称': '地域报表_城市',
            },
            {
                '文件简称': 'tg_report_关键词报表',
                '数据库名': '推广数据3',
                '集合名称': '关键词报表',
            },
            {
                '文件简称': 'tg_report_计划报表',
                '数据库名': '推广数据3',
                '集合名称': '计划报表',
            },
            {
                '文件简称': 'tg_report_权益报表',
                '数据库名': '推广数据3',
                '集合名称': '权益报表',
            },
            {
                '文件简称': 'tg_report_人群报表',
                '数据库名': '推广数据3',
                '集合名称': '人群报表',
            },
            {
                '文件简称': 'tg_report_营销场景报表',
                '数据库名': '推广数据3',
                '集合名称': '营销场景报表',
            },
            {
                '文件简称': 'tg_report_超级直播报表_人群',
                '数据库名': '推广数据3',
                '集合名称': '超级直播',
            },
            {
                '文件简称': 'tg_report_品销宝_明星店铺',
                '数据库名': '推广数据3',
                '集合名称': '品销宝',
            }
        ]
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                if 'py_xg' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # 这里排除掉非推广类报表
                is_continue = False
                db_name = None
                collection_name = None
                for item in report_names:
                    if item['文件简称'] in name:
                        db_name = item['数据库名']
                        collection_name = item['集合名称']
                        is_continue = True
                if not is_continue:
                    continue
                # 区分淘宝和天猫的报表
                if '万里马官方旗舰店' in name:
                    db_name = f'天猫_{db_name}'
                elif '万里马官方企业店' in name:
                    db_name = f'淘宝_{db_name}'
                else:
                    print(f'报表名称错误,不属于天猫/淘宝店:{name}')
                    continue

                if name.endswith('.csv'):  # 推广类报表
                    if '明星店铺' in name:  # 明星店铺可能会先释放 csv 文件
                        continue
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # 替换掉特殊字符
                    df.fillna(0, inplace=True)
                    date_min = df["日期"].values.min()
                    date_max = df["日期"].values.max()
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    df.insert(loc=1, column='店铺名称', value=shop_name)
                    new_name = f'py_xg_{name}'
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.xlsx') and '品销宝_明星店铺' in name:
                    # 品销宝
                    sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群']  # 品销宝
                    file_name4 = os.path.splitext(name)[0]  # 明星店铺报表
                    for sheet4 in sheets4:
                        df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
                        if len(df) == 0:
                            print(f'{name} 报表数据为空')
                            os.remove(os.path.join(root, name))
                            continue
                        if len(df) < 1:
                            print(f'{name} 跳过')
                            continue
                        else:
                            shop_name = re.findall(r'明星店铺_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
                            df.insert(loc=1, column='店铺名称', value=shop_name)
                            df.insert(loc=2, column='报表类型', value=sheet4)
                            df.fillna(0, inplace=True)
                            df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')  # 转换日期列
                            # min_clm = str(df['日期'].min()).split(' ')[0]
                            # max_clm = str(df['日期'].max()).split(' ')[0]
                            new_file_name4 = f'{sheet4}_py_xg_{file_name4}.csv'
                            # 以sheet名进一步创建子文件夹
                            # root_new = os.path.join(self.source_path, '推广报表/品销宝', sheet4)
                            self.save_to_csv(df, upload_path, new_file_name4)
                    os.remove(os.path.join(root, name))

                # 将数据传入 self.datas 等待更新进数据库
                if not db_name or not collection_name:
                    print(f'db_name/collection_name 不能为空')
                    continue
                self.datas.append(
                    {
                        '数据库名': db_name,
                        '集合名称': collection_name,
                        '数据主体': df,
                        '文件名': name,
                    }
                )

    def syj_reports_tm(self, path=None, is_except=[]):
        """ 生意经报表 """
        if not path:
            path = self.path
        report_names = [
            {
                '文件简称': 'baobei',
                '数据库名': '天猫_生意经3',
                '集合名称': '宝贝指标',
            },
            {
                '文件简称': 'order',
                '数据库名': '天猫_生意经3',
                '集合名称': '订单指标',
            },
            {
                '文件简称': '省份城市分析',
                '数据库名': '天猫_生意经3',
                '集合名称': '省份城市分析',
            },
            {
                '文件简称': '店铺销售指标',
                '数据库名': '天猫_生意经3',
                '集合名称': '店铺销售指标',
            },
        ]

        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                if 'py_xg' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # 这里排除掉非目标报表
                is_continue = False
                db_name = None
                collection_name = None
                for item in report_names:
                    if item['文件简称'] in name:
                        db_name = item['数据库名']
                        collection_name = item['集合名称']
                        is_continue = True
                if not is_continue:
                    continue

                if name.endswith('.csv') and 'baobei' in name:
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    pattern = re.findall(r'-(\d{4})(\d{2})(\d{2}).csv', name)[0]
                    df['日期'] = '-'.join(pattern)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    new_name = f'py_xg_天猫_baobeitrains_{"-".join(pattern)}.csv'
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.csv') and 'order' in name:
                    """ 这里不能使用表格原先的 gb2312, 会报错 """
                    # encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
                    pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)[0]
                    date1 = '-'.join(pattern[1:4])
                    date2 = '-'.join(pattern[4:7])
                    df.insert(loc=0, column='日期', value=date1)
                    df.insert(loc=1, column='数据周期', value=f'{date1}_{date2}')
                    df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
                    df['颜色编码'] = df['商家编码'].apply(
                        lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
                    new_name = f'py_xg_天猫_order_{date1}_{date2}.csv'
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.csv') and '省份城市分析' in name:
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)[0]
                    date = '-'.join(pattern[1:])
                    new_name = f'py_xg_天猫_{pattern[0]}-{date}.csv'
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        os.remove(os.path.join(root, name))
                        continue
                    df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
                    df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
                    df['省'].fillna(method='ffill', inplace=True)
                    df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
                    pov = df.pop('省')
                    city = df.pop('城市')
                    df['省+市'] = df['省份']
                    df['省份'] = pov
                    df.insert(loc=1, column='城市', value=city)
                    df.insert(loc=0, column='日期', value=date)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.csv') and '店铺销售指标' in name:
                    # 生意经, 店铺指标,仅限月数据,实际日指标也可以
                    name_st = re.findall(r'(.*)\(分日', name)
                    if not name_st:
                        print(f'{name} 已转换的表格')
                        continue
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        os.remove(os.path.join(root, name))
                        continue
                    df['日期'] = df['日期'].astype(str).apply(
                        lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')  # 转换日期列
                    # min_clm = str(df.min()['日期']).split(' ')[0]
                    # max_clm = str(df.max()['日期']).split(' ')[0]
                    min_clm = str(df['日期'].min()).split(' ')[0]
                    max_clm = str(df['日期'].max()).split(' ')[0]
                    new_name = f'py_xg_天猫_{name_st[0]}-{min_clm}_{max_clm}.csv'  # 保存时将(分日)去掉
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))

                # 将数据传入 self.datas 等待更新进数据库
                if not db_name or not collection_name:
                    print(f'db_name/collection_name 不能为空')
                    continue
                self.datas.append(
                    {
                        '数据库名': db_name,
                        '集合名称': collection_name,
                        '数据主体': df,
                        '文件名': name,
                    }
                )

    def syj_reports_tb(self, path=None, is_except=[]):
        """ 淘宝店 生意经报表 """
        if not path:
            path = self.path
        report_names = [
            {
                '文件简称': 'baobei',
                '数据库名': '淘宝_生意经3',
                '集合名称': '宝贝指标',
            },
            {
                '文件简称': 'order',
                '数据库名': '淘宝_生意经3',
                '集合名称': '订单指标',
            },
            {
                '文件简称': '省份城市分析',
                '数据库名': '淘宝_生意经3',
                '集合名称': '省份城市分析',
            },
            {
                '文件简称': '店铺销售指标',
                '数据库名': '淘宝_生意经3',
                '集合名称': '店铺销售指标',
            },
        ]

        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                if 'py_xg' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # 这里排除掉非目标报表
                is_continue = False
                db_name = None
                collection_name = None
                for item in report_names:
                    if item['文件简称'] in name:
                        db_name = item['数据库名']
                        collection_name = item['集合名称']
                        is_continue = True
                if not is_continue:
                    continue

                if name.endswith('.csv') and 'baobei' in name:
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    pattern = re.findall(r'-(\d{4})(\d{2})(\d{2}).csv', name)[0]
                    df['日期'] = '-'.join(pattern)
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    new_name = f'py_xg_淘宝_baobeitrains_{"-".join(pattern)}.csv'
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.csv') and 'order' in name:
                    """ 这里不能使用表格原先的 gb2312, 会报错 """
                    # encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
                    pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)[0]
                    date1 = '-'.join(pattern[1:4])
                    date2 = '-'.join(pattern[4:7])
                    df.insert(loc=0, column='日期', value=date1)
                    df.insert(loc=1, column='数据周期', value=f'{date1}_{date2}')
                    df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
                    df['颜色编码'] = df['商家编码'].apply(
                        lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
                    new_name = f'py_xg_淘宝_order_{date1}_{date2}.csv'
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.csv') and '省份城市分析' in name:
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)[0]
                    date = '-'.join(pattern[1:])
                    new_name = f'py_xg_淘宝_{pattern[0]}-{date}.csv'
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        os.remove(os.path.join(root, name))
                        continue
                    df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
                    df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
                    df['省'].fillna(method='ffill', inplace=True)
                    df['城市'].replace(to_replace=[' ├─ | └─ '], value='', regex=True, inplace=True)
                    pov = df.pop('省')
                    city = df.pop('城市')
                    df['省+市'] = df['省份']
                    df['省份'] = pov
                    df.insert(loc=1, column='城市', value=city)
                    df.insert(loc=0, column='日期', value=date)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))
                elif name.endswith('.csv') and '店铺销售指标' in name:
                    # 生意经, 店铺指标,仅限月数据,实际日指标也可以
                    name_st = re.findall(r'(.*)\(分日', name)
                    if not name_st:
                        print(f'{name} 已转换的表格')
                        continue
                    encoding = self.get_encoding(file_path=os.path.join(root, name))
                    df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                    if len(df) == 0:
                        print(f'{name} 报表数据为空')
                        os.remove(os.path.join(root, name))
                        continue
                    df['日期'] = df['日期'].astype(str).apply(
                        lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')  # 转换日期列
                    # min_clm = str(df.min()['日期']).split(' ')[0]
                    # max_clm = str(df.max()['日期']).split(' ')[0]
                    min_clm = str(df['日期'].min()).split(' ')[0]
                    max_clm = str(df['日期'].max()).split(' ')[0]
                    new_name = f'py_xg_淘宝_{name_st[0]}-{min_clm}_{max_clm}.csv'  # 保存时将(分日)去掉
                    df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
                    os.remove(os.path.join(root, name))

                # 将数据传入 self.datas 等待更新进数据库
                if not db_name or not collection_name:
                    print(f'db_name/collection_name 不能为空')
                    continue
                self.datas.append(
                    {
                        '数据库名': db_name,
                        '集合名称': collection_name,
                        '数据主体': df,
                        '文件名': name,
                    }
                )

    def jd_reports(self, path=None, is_except=[]):
        """ 处理京东报表 """
        if not path:
            path = self.path
        report_names = [
            {
                '文件简称': '京东推广_点击成交',
                '数据库名': '京东数据3',
                '集合名称': '推广数据_京准通',
            },
            {
                '文件简称': '京东推广_搜索词',
                '数据库名': '京东数据3',
                '集合名称': '推广数据_搜索词报表',
            },
            {
                '文件简称': '京东推广_关键词',
                '数据库名': '京东数据3',
                '集合名称': '推广数据_关键词报表',
            },
            {
                '文件简称': '京东商智_sku_商品明细',
                '数据库名': '京东数据3',
                '集合名称': '京东商智_sku_商品明细',
            },
            {
                '文件简称': '京东商智_spu_商品明细',
                '数据库名': '京东数据3',
                '集合名称': '京东商智_spu_商品明细',
            },
            {
                '文件简称': '京东商智_店铺来源_三级来源',
                '数据库名': '京东数据3',
                '集合名称': '京东商智_店铺来源',
            },
        ]

        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                if 'py_xg' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # 这里排除掉非目标报表
                is_continue = False
                db_name = None
                collection_name = None
                for item in report_names:
                    if item['文件简称'] in name:
                        db_name = item['数据库名']
                        collection_name = item['集合名称']
                        is_continue = True
                if not is_continue:
                    continue

                if name.endswith('.xlsx') and '京东推广_' in name:
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    new_name = f'py_xg_{name}'
                    os.rename(os.path.join(root, name), os.path.join(root, new_name))
                elif name.endswith('.xlsx') and '京东商智_sku_商品明细' in name:
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    df.replace(to_replace=['-'], value='', regex=False, inplace=True)
                    pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
                    df.insert(loc=0, column='日期', value=pattern)
                    df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
                    df.fillna(0, inplace=True)
                    new_name = f'py_xg_{name}'
                    df.to_excel(os.path.join(upload_path, new_name),
                                index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
                    os.remove(os.path.join(root, name))
                elif name.endswith('.xlsx') and '京东商智_spu_商品明细' in name:
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    df.replace(to_replace=['-'], value='', regex=False, inplace=True)
                    pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
                    df.insert(loc=0, column='日期', value=pattern)
                    df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
                    df.fillna(0, inplace=True)
                    new_name = f'py_xg_{name}'
                    df.to_excel(os.path.join(upload_path, new_name),
                                index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
                    os.remove(os.path.join(root, name))
                elif name.endswith('.xlsx') and '京东商智_店铺来源_三级来源' in name:
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    df.replace(to_replace=['-'], value='', regex=False, inplace=True)
                    df.rename(columns={'时间': '日期'}, inplace=True)
                    for col in df.columns.tolist():
                        if '环比' in col or '同比' in col:
                            df.drop(col, axis=1, inplace=True)
                    df.fillna(0, inplace=True)
                    new_name = f'py_xg_{name}'
                    df.to_excel(os.path.join(upload_path, new_name),
                                index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
                    os.remove(os.path.join(root, name))

                # 将数据传入 self.datas 等待更新进数据库
                if not db_name or not collection_name:
                    print(f'db_name/collection_name 不能为空')
                    continue
                # print(name)
                self.datas.append(
                    {
                        '数据库名': db_name,
                        '集合名称': collection_name,
                        '数据主体': df,
                        '文件名': name,
                    }
                )

    def sp_scene_clean(self, path=None, is_except=[]):
        if not path:
            path = self.path
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
                    continue
                if 'py_xg' in name:
                    continue
                is_continue = False
                if is_except:
                    for item in is_except:
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                db_name = None  # 初始化, 避免文件未匹配时变量未定义
                collection_name = None
                if name.endswith('.xlsx') and '商品素材_' in name:
                    shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)_', name)[0]
                    df = pd.read_excel(os.path.join(root, name), header=0)
                    df.insert(loc=1, column='店铺名称', value=shop_name)
                    new_name = f'py_xg_{name}'
                    df.to_excel(os.path.join(upload_path, new_name),
                                index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
                    if '官方旗舰店' in name:
                        db_name = '属性设置3'
                        collection_name = '商品素材_天猫'
                    elif '官方企业店' in name:
                        db_name = '属性设置3'
                        collection_name = '商品素材_淘宝'
                    os.remove(os.path.join(root, name))

                # 将数据传入 self.datas 等待更新进数据库
                if not db_name or not collection_name:
                    print(f'db_name/collection_name 不能为空')
                    continue
                self.datas.append(
                    {
                        '数据库名': db_name,
                        '集合名称': collection_name,
                        '数据主体': df,
                        '文件名': name,
                    }
                )
    """
    {文件分类}
    将已处理完的文件 分类移到原始文件夹下
    此处t_path参数定义了子文件夹的生成名称
    """

    @staticmethod
    def move_files(path, _name, target_path, _as_month=None):
        """
        name: 移动的文件名,
        target_path: 目标位置
        """
        t2 = target_path  # t2 赋值有用, 不能省略
        if not os.path.exists(t2):  # 如果目录不存在则创建
            os.makedirs(t2, exist_ok=True)
        if _as_month:
            _date = re.findall(r'(\d{4}-\d{2})-\d{2}', str(_name))
            if _date:
                _date = _date[0]
                t2 = pathlib.Path(t2, _date)  # 添加 年月分类
                if not os.path.exists(t2):
                    os.makedirs(t2, exist_ok=True)
        old_file = os.path.join(t2, _name)  # 检查目标位置是否已经存在该文件
        if os.path.isfile(old_file):
            os.remove(old_file)  # 如果存在则移除
        shutil.move(os.path.join(path, _name), t2)  # 将文件从下载文件夹移到目标位置

    # @try_except
    def move_sjy(self, path=None, is_except=[]):
        if not path:
            path = self.path
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                # print(name)
                is_continue = False
                if is_except:
                    for item in is_except:
                        # print(item, f'-----', os.path.join(root, name))
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # print(is_except, is_continue)
                def bib(paths, _as_month=None):
                    """闭包函数"""
                    self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)

                if 'py_xg' not in name:  # 排除非目标文件
                    continue

                if '天猫' in name and name.endswith('.csv') and 'baobei' in name:
                    t_path = os.path.join(self.source_path, '天猫_生意经', '宝贝指标')
                    bib(t_path, _as_month=True)
                elif '天猫' in name and name.endswith('.csv') and '省份城市分析' in name:
                    t_path = os.path.join(self.source_path, '天猫_生意经', '省份城市分析')
                    bib(t_path, _as_month=True)
                elif '天猫' in name and name.endswith('.csv') and '店铺销售指标' in name:
                    t_path = os.path.join(self.source_path, '天猫_生意经', '店铺销售指标')
                    bib(t_path, _as_month=False)
                elif '天猫' in name and name.endswith('.csv') and 'order' in name:
                    t_path = os.path.join(self.source_path, '天猫_生意经', '订单数据')
                    bib(t_path, _as_month=False)
                elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and 'baobei' in name:
                    t_path = os.path.join(self.source_path, '淘宝_生意经', '宝贝指标')
                    bib(t_path, _as_month=True)
                elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and '省份城市分析' in name:
                    t_path = os.path.join(self.source_path, '淘宝_生意经', '省份城市分析')
                    bib(t_path, _as_month=True)
                elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and '店铺销售指标' in name:
                    t_path = os.path.join(self.source_path, '淘宝_生意经', '店铺销售指标')
                    bib(t_path, _as_month=False)
                elif ('淘宝' in name or '企业店' in name) and name.endswith('.csv') and 'order' in name:
                    t_path = os.path.join(self.source_path, '淘宝_生意经', '订单数据')
                    bib(t_path, _as_month=False)

    # @try_except
    def move_jd(self, path=None, is_except=[]):
        if not path:
            path = self.path
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                # print(name)
                is_continue = False
                if is_except:
                    for item in is_except:
                        # print(item, f'-----', os.path.join(root, name))
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # print(is_except, is_continue)
                def bib(paths, _as_month=None):
                    """闭包函数"""
                    self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)

                if 'py_xg' not in name:  # 排除非目标文件
                    continue

                if name.endswith('.xlsx') and '京东商智_spu_商品明细' in name:
                    t_path = os.path.join(self.source_path, '京东报表', 'spu_商品明细')
                    bib(t_path, _as_month=True)
                elif name.endswith('.xlsx') and '京东商智_sku_商品明细' in name:
                    t_path = os.path.join(self.source_path, '京东报表', 'sku_商品明细')
                    bib(t_path, _as_month=True)
                elif name.endswith('.xlsx') and '京东推广_搜索词' in name:
                    t_path = os.path.join(self.source_path, '京东报表', '搜索词报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.xlsx') and '京东推广_点击成交' in name:
                    t_path = os.path.join(self.source_path, '京东报表', '推广报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.xlsx') and '京东推广_关键词点击' in name:
                    t_path = os.path.join(self.source_path, '京东报表', '关键词报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.xlsx') and '京东商智_店铺来源_三级来源' in name:
                    t_path = os.path.join(self.source_path, '京东报表', '店铺来源_三级来源')
                    bib(t_path, _as_month=True)

    # @try_except
    def move_tg_tm(self, path=None, is_except=[]):
        if not path:
            path = self.path
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                # print(name)
                is_continue = False
                if is_except:
                    for item in is_except:
                        # print(item, f'-----', os.path.join(root, name))
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue
                # print(is_except, is_continue)
                def bib(paths, _as_month=None):
                    """闭包函数"""
                    self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)

                if 'py_xg' not in name:  # 排除非目标文件
                    continue

                if name.endswith('.csv') and 'tg_report_主体报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '主体报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_营销场景报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '营销场景报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_人群报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '人群报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_权益报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '权益报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_计划报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '计划报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_关键词报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '关键词报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_地域报表_省份_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '地域报表_省份')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_地域报表_城市_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '地域报表_城市')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_单元报表_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '单元报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_创意报表_素材粒度_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '创意报表_素材粒度')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_创意报表_创意粒度_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '创意报表_创意粒度')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_超级直播报表_人群_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '天猫推广报表', '超级直播报表_人群')
                    bib(t_path, _as_month=True)

                elif name.endswith('.csv') and 'tg_report_品销宝_明星店铺_万里马官方旗舰店' in name:
                    if '账户' in name:
                        t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '账户报表')
                        bib(t_path, _as_month=True)
                    elif '推广计划' in name:
                        t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '推广计划报表')
                        bib(t_path, _as_month=True)
                    elif '推广单元' in name:
                        t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '推广单元报表')
                        bib(t_path, _as_month=True)
                    elif '创意' in name:
                        t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '创意报表')
                        bib(t_path, _as_month=True)
                    elif '品牌流量包' in name:
                        t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '品牌流量包报表')
                        bib(t_path, _as_month=True)
                    elif '定向人群' in name:
                        t_path = os.path.join(self.source_path, '天猫推广报表', '品销宝', '定向人群报表')
                        bib(t_path, _as_month=True)
                elif name.endswith('xlsx') and '商品素材_万里马官方旗舰店' in name:
                    t_path = os.path.join(self.source_path, '商品素材', '天猫')
                    bib(t_path, _as_month=True)
                elif name.endswith('xlsx') and '商品素材_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '商品素材', '淘宝')
                    bib(t_path, _as_month=True)

    # @try_except
    def move_tg_tb(self, path=None, is_except=[]):
        if not path:
            path = self.path
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                # print(name)
                is_continue = False
                if is_except:
                    for item in is_except:
                        # print(item, f'-----', os.path.join(root, name))
                        if item in os.path.join(root, name):
                            # print(name)
                            is_continue = True
                            break
                if is_continue:  # 需要排除不做处理的文件或文件夹
                    continue

                # print(is_except, is_continue)
                def bib(paths, _as_month=None):
                    """闭包函数"""
                    self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)

                if 'py_xg' not in name:  # 排除非目标文件
                    continue

                if name.endswith('.csv') and 'tg_report_主体报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '主体报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_营销场景报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '营销场景报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_人群报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '人群报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_权益报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '权益报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_计划报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '计划报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_关键词报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '关键词报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_地域报表_省份_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '地域报表_省份')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_地域报表_城市_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '地域报表_城市')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_单元报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '单元报表')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_创意报表_素材粒度_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '创意报表_素材粒度')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_创意报表_创意粒度_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '创意报表_创意粒度')
                    bib(t_path, _as_month=True)
                elif name.endswith('.csv') and 'tg_report_超级直播报表_万里马官方企业店' in name:
                    t_path = os.path.join(self.source_path, '淘宝推广报表', '超级直播报表')
                    bib(t_path, _as_month=True)

    # @try_except
    def new_unzip(self, path=None, is_move=None):
        """
        {解压并移除zip文件}
        如果是京东的商品明细,处理过程:
        1. 读取 zip包的文件名
        2. 组合完整路径,判断文件夹下是否已经有同名文件
        3. 如果有,则将该同名文件改名,(从文件名中提取日期,重新拼接文件名)
        4. 然后解压 zip包
        5. 需要用 _jd_rename 继续重命名刚解压的文件
        is_move 参数, 是否移除 下载目录的所有zip 文件
        """
        if not path:
            path = self.path
        res_names = []  # 需要移除的压缩文件
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if '~$' in name or 'DS_Store' in name:
                    continue
                if name.endswith('.zip'):
                    old_file = os.path.join(root, name)
                    f = zipfile.ZipFile(old_file, 'r')
                    if len(f.namelist()) == 1:  # 压缩包只有一个文件的情况
                        for zip_name in f.namelist():  # 读取zip内的文件名称
                            # zip_name_1 = zip_name.encode('cp437').decode('utf-8')
                            try:
                                zip_name_1 = zip_name.encode('utf-8').decode('utf-8')
                            except:
                                zip_name_1 = zip_name.encode('cp437').decode('utf-8')
                            new_path = os.path.join(root, zip_name_1)  # 拼接解压后的文件路径
                            if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path:  # 是否存在和包内同名的文件
                                # 专门处理京东文件, 已过期可删
                                df = pd.read_excel(new_path)
                                try:
                                    pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                                          name)
                                    pattern2 = re.findall(
                                        r'\d{8}_(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
                                        name)
                                    if pattern1:
                                        year_date = '-'.join(list(pattern1[0])) + '_' + '-'.join(list(pattern1[0]))
                                    elif pattern2:
                                        year_date = '-'.join(list(pattern2[0])[0:3]) + '_' + '-'.join(
                                            list(pattern2[0])[3:7])
                                    else:
                                        year_date = '无法提取日期'
                                        print(f'{name} 无法从文件名中提取日期,请检查pattern或文件')
                                    if ('10035975359247' in df['商品ID'].values or '10056642622343' in
                                            df['商品ID'].values):
                                        os.rename(new_path,
                                                  os.path.join(root, 'sku_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    elif ('10021440233518' in df['商品ID'].values or '10022867813485' in
                                          df['商品ID'].values):
                                        os.rename(new_path,
                                                  os.path.join(root, 'spu_' + year_date + '_全部渠道_商品明细.xls'))
                                        f.extract(zip_name_1, root)
                                    if is_move:
                                        os.remove(os.path.join(root, name))
                                except Exception as e:
                                    print(e)
                                    continue
                            else:
                                f.extract(zip_name, root)
                                if zip_name_1 != zip_name:
                                    os.rename(os.path.join(root, zip_name), os.path.join(root, zip_name_1))
                                if is_move:
                                    res_names.append(name)
                                    # os.remove(os.path.join(root, name))  # 这里不能移除,会提示文件被占用
                        f.close()
                    else:  # 压缩包内包含多个文件的情况
                        f.close()
                        self.unzip_all(path=old_file, save_path=path)

        if is_move:
            for name in res_names:
                os.remove(os.path.join(path, name))
                print(f'移除{os.path.join(path, name)}')

    @staticmethod
    def unzip_all(path, save_path):
        """
        遍历目录, 重命名有乱码的文件
        2. 如果压缩包是文件夹, 则保存到新文件夹,并删除有乱码的文件夹
        3. 删除MAC系统的临时文件夹__MACOSX
        """
        with PyZipFile(path) as _f:
            _f.extractall(save_path)
            _f.close()
        for _root, _dirs, _files in os.walk(save_path, topdown=False):
            for _name in _files:
                if '~$' in _name or 'DS_Store' in _name:
                    continue
                try:
                    _new_root = _root.encode('cp437').decode('utf-8')
                    _new_name = _name.encode('cp437').decode('utf-8')
                except:
                    _new_root = _root.encode('utf-8').decode('utf-8')
                    _new_name = _name.encode('utf-8').decode('utf-8')
                _old = os.path.join(_root, _name)
                _new = os.path.join(_new_root, _new_name)
                if _new_root != _root:  # 目录乱码,创建新目录
                    os.makedirs(_new_root, exist_ok=True)
                os.rename(_old, _new)
            try:
                _new_root = _root.encode('cp437').decode('utf-8')
            except:
                _new_root = _root.encode('utf-8').decode('utf-8')
            if _new_root != _root or '__MACOSX' in _root:
                shutil.rmtree(_root)

    def upload_df(self, service_databases=None, path=None):
        """
        将清洗后的 df 上传数据库, copysh.py 调用
        """
        if not service_databases:
            service_databases = self.service_databases
        df_to_json = df_types.DataTypes()  # json 文件, 包含数据的 dtypes 信息
        for service_database in service_databases:
            for service_name, database in service_database.items():
                # print(service_name, database)
                if database == 'mongodb':
                    username, password, host, port = get_myconf.select_config_values(
                        target_service=service_name,
                        database=database,
                    )
                    d = mongo.UploadMongo(
                        username=username,
                        password=password,
                        host=host,
                        port=port,
                        drop_duplicates=False,
                    )
                    for data in self.datas:
                        db_name, collection_name, df = data['数据库名'], data['集合名称'], data['数据主体']
                        df_to_json.get_df_types(
                            df=df,
                            db_name=db_name,
                            collection_name=collection_name,
                            is_file_dtype=True,  # 默认本地文件优先: True
                        )
                        d.df_to_mongo(df=df, db_name=db_name, collection_name=collection_name)
                    if d.client:
                        d.client.close()

                elif database == 'mysql':
                    username, password, host, port = get_myconf.select_config_values(
                        target_service=service_name,
                        database=database,
                    )
                    m = mysql.MysqlUpload(
                        username=username,
                        password=password,
                        host=host,
                        port=port,
                    )
                    for data in self.datas:
                        df, db_name, collection_name, rt_filename = data['数据主体'], data['数据库名'], data['集合名称'], data['文件名']
                        df_to_json.get_df_types(
                            df=df,
                            db_name=db_name,
                            collection_name=collection_name,
                            is_file_dtype=True,  # 默认本地文件优先: True
                        )
                        m.df_to_mysql(
                            df=df,
                            db_name=db_name,
                            table_name=collection_name,
                            move_insert=True,  # 先删除,再插入
                            df_sql=False,  # 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重
                            drop_duplicates=False,  # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
                            filename=rt_filename,  # 用来追踪处理进度
                            service_database=service_database,  # 字典
                        )
        df_to_json.as_json_file()  # 写入 json 文件, 包含数据的 dtypes 信息


def main(service_databases=None):
    # 数据分类

    if not service_databases:
        service_databases = [
            # {'home_lx': 'mongodb'},
            {'home_lx': 'mysql'},
            # {'company': 'mysql'},
            # {'nas': 'mysql'},
        ]

    c = DataClean(
        path=upload_path,  # 源文件目录,下载文件夹
        source_path=source_path3,  # 原始文件保存目录
        service_databases=service_databases
    )
    c.new_unzip(is_move=True)  # 解压文件, is_move 解压后是否删除原 zip 压缩文件
    c.tg_reports(is_except=['except'])  # 推广报表,天猫淘宝共同清洗
    c.syj_reports_tm(is_except=['except'])  # 天猫生意经
    # c.syj_reports_tb(is_except=['except'])  # 淘宝生意经,不可以和天猫同时运行
    c.jd_reports(is_except=['except'])  # 清洗京东报表
    c.sp_scene_clean(is_except=['except'])  # 商品素材
    c.upload_df(service_databases=service_databases)  # 上传数据库

    c.move_sjy(is_except=['临时文件',])  # 生意经,移到文件到原始文件夹
    c.move_jd(is_except=['临时文件', ])  # 京东,移到文件到原始文件夹
    c.move_tg_tm(is_except=['临时文件', ])  # 天猫,移到文件到原始文件夹
    c.move_tg_tb(is_except=['临时文件', ])  # 淘宝店,移到文件到原始文件夹

    # 更新货品年份基准表, 属性设置 2 - 货品年份基准
    p = products.Products()
    p.to_mysql(service_databases=service_databases)

    # 清理所有非聚合数据的库
    optimize_data.op_data(
        db_name_lists=[
            '京东数据2',
            '推广数据2',
            '市场数据2',
            '生意参谋2',
            '生意经2',
            '属性设置2',
            # '聚合数据',  # 不在这里清理聚合数据, 还未开始聚合呢
            '京东数据3',
            '天猫_推广数据3',
            '淘宝_推广数据3',
            # '市场数据3',
            # '生意参谋3',
            '天猫_生意经3',
            # '淘宝_生意经3',
        ],
        days=100,
        is_mongo=True,
        is_mysql=True,
    )

    # 数据聚合
    query_data.data_aggregation(service_databases=service_databases, months=3)
    time.sleep(60)

    # 清理聚合数据, mongodb 中没有聚合数据,所以只需要清理 mysql 即可
    optimize_data.op_data(
        db_name_lists=['聚合数据'],
        days=3650,
        service_databases=service_databases,
        is_mongo=False,
        is_mysql=True,
    )


if __name__ == '__main__':
    main(
        service_databases=[
            # {'company': 'mysql'},
            {'home_lx': 'mysql'},
            # {'home_lx': 'mongodb'},
            # {'nas': 'mysql'},
        ]
    )

    # c = DataClean(
    #     path=upload_path,  # 源文件目录,下载文件夹
    #     source_path=source_path3,  # 原始文件保存目录
    #     service_databases=[{'home_lx': 'mysql'},]
    # )
    # c.sp_scene_clean(is_except=['except'])  # 商品素材
    # c.move_tg_tm(is_except=['临时文件', ])  # 天猫,移到文件到原始文件夹
mdbq/dataframe/converter.py
CHANGED
@@ -44,8 +44,12 @@ class DataFrameConverter(object):
                 df.pop(col)  # 等待插入的 df 不能包含 id 列,否则可能跟现有 id 主键冲突
                 continue
 
-
-
+            try:
+                # 百分比在某些数据库中不兼容, 转换百分比为小数, # 转百分比的列不能含有中文或特殊字符
+                df[col] = df[col].apply(
+                    lambda x: float(float((str(x).rstrip("%"))) / 100) if re.findall(r'^\d+%', str(x)) else x)
+            except Exception as e:
+                print(f'留意错误信息: 位于列 -> {col} -> {e}')
 
             if col.endswith('占比') or col.endswith('率'):
                 df = df.astype({col: float}, errors='raise')
{mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/RECORD
CHANGED
@@ -9,6 +9,7 @@ mdbq/aggregation/query_data.py,sha256=WKe42Xq1Gi-ELuIT0k2jh3X4-R7heb0ub3Mj3yuCRA
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
 mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
 mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
+mdbq/clean/clean_upload.py,sha256=q_3kjiE0sU6uV13TW9rVuPmbO01itYhkC4gTnz_nZ5o,64455
 mdbq/clean/data_clean.py,sha256=ucfslhqXVZoH2QaXHSAWDky0GhIvH9f4GeNaHg4SrFE,104790
 mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
 mdbq/company/copysh.py,sha256=NvlXCBZBcO2GIT5nLRYYqhOyHWM1-1RE7DHvgbj6jmQ,19723
@@ -19,7 +20,7 @@ mdbq/config/products.py,sha256=hN9UMkM6j76HYMulTYdtr3mOhh9QdpvvrLH14a_mbFY,5980
 mdbq/config/set_support.py,sha256=xkZCX6y9Bq1ppBpJAofld4B2YtchA7fl0eT3dx3CrSI,777
 mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
 mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
-mdbq/dataframe/converter.py,sha256=
+mdbq/dataframe/converter.py,sha256=u7rQbIsgVZWOIybJaknduf7wViBdBkyU8mwUo24mDt0,4304
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
@@ -41,7 +42,7 @@ mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/req_post/req_tb.py,sha256=PexWSCPJNM6Tv0ol4lAWIhlOwsAr_frnjtcdSHCFiek,36179
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=KdihSB3q44jsXUQAldfWRVfCSrEw2MNbM-_BhP_29g4,14448
-mdbq-2.5.
-mdbq-2.5.
-mdbq-2.5.
-mdbq-2.5.
+mdbq-2.5.6.dist-info/METADATA,sha256=IIsrPFdnbkCfH4ziUSl_U05g3Cvc9vwyySZjLXN6SVU,243
+mdbq-2.5.6.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
+mdbq-2.5.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-2.5.6.dist-info/RECORD,,
{mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/WHEEL
File without changes
{mdbq-2.5.4.dist-info → mdbq-2.5.6.dist-info}/top_level.txt
File without changes
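Editor's note (not part of the package diff): the `_as_month` flag on `DataClean.move_files` buckets archived reports into year-month subfolders by extracting `YYYY-MM` from the first `YYYY-MM-DD` date embedded in the file name. A standalone sketch of that extraction; the file names below are hypothetical:

import re

def month_bucket(filename):
    # Mirrors the move_files regex: keep the year-month part of the first
    # embedded YYYY-MM-DD date, else no sub-bucket.
    m = re.findall(r'(\d{4}-\d{2})-\d{2}', filename)
    return m[0] if m else None

print(month_bucket('py_xg_tg_report_主体报表_万里马官方旗舰店_2024-05-03.csv'))  # -> 2024-05
print(month_bucket('py_xg_天猫_店铺销售指标.csv'))  # -> None; file stays in the top-level target folder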