mdbq 1.9.5__py3-none-any.whl → 1.9.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +101 -2
- mdbq/clean/data_clean.py +3 -1
- mdbq/company/copysh.py +1 -0
- {mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/METADATA +1 -1
- {mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/RECORD +7 -7
- {mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/WHEEL +0 -0
- {mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -59,6 +59,7 @@ class DatabaseUpdate:
|
|
59
59
|
|
60
60
|
for root, dirs, files in os.walk(self.path, topdown=False):
|
61
61
|
for name in files:
|
62
|
+
check_remove_file = False # 设置这个参数的目的: 避免误删其他文件, 不是本程序数据清洗覆盖的文件不做干预
|
62
63
|
if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
|
63
64
|
continue
|
64
65
|
db_name = None # 初始化/重置变量,避免进入下一个循环
|
@@ -93,17 +94,21 @@ class DatabaseUpdate:
|
|
93
94
|
ck = df.columns.tolist()
|
94
95
|
if '场景名字' not in ck:
|
95
96
|
print(f'1.2.0 {name} 报表字段缺失, 请选择Pbix数据模板下载')
|
97
|
+
check_remove_file = True
|
96
98
|
continue
|
97
99
|
if len(df) == 0:
|
98
100
|
print(f'1.3.0 {name} 报表是空的, 请重新下载')
|
101
|
+
check_remove_file = True
|
99
102
|
continue
|
100
103
|
cols = df.columns.tolist()
|
101
104
|
if '日期' not in cols:
|
102
105
|
print(f'1.4.0 {name} 报表不包含分日数据, 已跳过')
|
106
|
+
check_remove_file = True
|
103
107
|
continue
|
104
108
|
if '省' in cols:
|
105
109
|
if '市' not in cols:
|
106
110
|
print(f'1.5.0 {name} 请下载市级地域报表,而不是省报表')
|
111
|
+
check_remove_file = True
|
107
112
|
continue
|
108
113
|
# df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
|
109
114
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
@@ -114,11 +119,13 @@ class DatabaseUpdate:
|
|
114
119
|
else:
|
115
120
|
db_name = '推广数据2'
|
116
121
|
collection_name = f'{tg_name}'
|
122
|
+
check_remove_file = True
|
117
123
|
if name.endswith('.csv') and '超级直播' in name:
|
118
124
|
# 超级直播
|
119
125
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
120
126
|
if len(df) == 0:
|
121
127
|
print(f'{name} 报表数据为空')
|
128
|
+
check_remove_file = True
|
122
129
|
continue
|
123
130
|
pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
|
124
131
|
if not pattern: # 说明已经转换过
|
@@ -130,27 +137,34 @@ class DatabaseUpdate:
|
|
130
137
|
shop_name = ''
|
131
138
|
# df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
|
132
139
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
140
|
+
check_remove_file = True
|
133
141
|
elif name.endswith('.xls') and '短直联投' in name:
|
134
142
|
# 短直联投
|
135
143
|
df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
|
136
144
|
df = pd.concat(df)
|
137
145
|
if len(df) == 0:
|
138
146
|
print(f'{name} 报表数据为空')
|
147
|
+
check_remove_file = True
|
139
148
|
continue
|
140
149
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
150
|
+
check_remove_file = True
|
141
151
|
elif name.endswith('.xls') and '视频加速推广' in name:
|
142
152
|
# 超级短视频
|
143
153
|
df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
|
144
154
|
df = pd.concat(df)
|
145
155
|
if len(df) == 0:
|
146
156
|
print(f'{name} 报表数据为空')
|
157
|
+
check_remove_file = True
|
147
158
|
continue
|
148
159
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
160
|
+
check_remove_file = True
|
149
161
|
if '人群报表汇总' in name:
|
150
162
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
|
151
163
|
if len(df) == 0:
|
152
164
|
print(f'{name} 报表数据为空')
|
165
|
+
check_remove_file = True
|
153
166
|
continue
|
167
|
+
check_remove_file = True
|
154
168
|
# ----------------- 推广报表 分割线 -----------------
|
155
169
|
# ----------------- 推广报表 分割线 -----------------
|
156
170
|
date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
|
@@ -161,6 +175,7 @@ class DatabaseUpdate:
|
|
161
175
|
df = pd.read_excel(os.path.join(root, name), header=5)
|
162
176
|
if len(df) == 0:
|
163
177
|
print(f'{name} 报表数据为空')
|
178
|
+
check_remove_file = True
|
164
179
|
continue
|
165
180
|
# df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
|
166
181
|
# df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
@@ -186,15 +201,19 @@ class DatabaseUpdate:
|
|
186
201
|
collection_name='店铺来源_月数据_旧版'
|
187
202
|
else:
|
188
203
|
collection_name='店铺来源_日数据_旧版'
|
204
|
+
check_remove_file = True
|
189
205
|
elif name.endswith('.csv') and '客户运营平台_客户列表' in name:
|
190
206
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
207
|
+
check_remove_file = True
|
191
208
|
elif name.endswith('.xlsx') and '直播分场次效果' in name:
|
192
209
|
pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
|
193
210
|
if pattern:
|
211
|
+
check_remove_file = True
|
194
212
|
continue
|
195
213
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
196
214
|
if len(df) == 0:
|
197
215
|
print(f'{name} 报表数据为空')
|
216
|
+
check_remove_file = True
|
198
217
|
continue
|
199
218
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
200
219
|
df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
@@ -203,6 +222,7 @@ class DatabaseUpdate:
|
|
203
222
|
df['日期'] = df['日期'].apply(
|
204
223
|
lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
|
205
224
|
df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
|
225
|
+
check_remove_file = True
|
206
226
|
|
207
227
|
elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
|
208
228
|
# 店铺来源,手淘搜索,关键词
|
@@ -210,6 +230,7 @@ class DatabaseUpdate:
|
|
210
230
|
df = pd.read_excel(os.path.join(root, name), header=5)
|
211
231
|
if len(df) == 0:
|
212
232
|
print(f'{name} 报表数据为空')
|
233
|
+
check_remove_file = True
|
213
234
|
continue
|
214
235
|
df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
215
236
|
df.insert(loc=0, column='日期', value=pattern[0][1])
|
@@ -221,12 +242,14 @@ class DatabaseUpdate:
|
|
221
242
|
if pattern[0][0] != pattern[0][1]:
|
222
243
|
data_lis = pattern[0][0] + '_' + pattern[0][1]
|
223
244
|
df.insert(loc=1, column='数据周期', value=data_lis)
|
245
|
+
check_remove_file = True
|
224
246
|
|
225
247
|
elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
|
226
248
|
# 店铺商品排行
|
227
249
|
df = pd.read_excel(os.path.join(root, name), header=4)
|
228
250
|
if len(df) == 0:
|
229
251
|
print(f'{name} 报表数据为空')
|
252
|
+
check_remove_file = True
|
230
253
|
continue
|
231
254
|
# df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
|
232
255
|
# df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
@@ -234,18 +257,22 @@ class DatabaseUpdate:
|
|
234
257
|
if date01[0] != date02[0]:
|
235
258
|
data_lis = date01[0] + '_' + date02[0]
|
236
259
|
df.insert(loc=1, column='数据周期', value=data_lis)
|
260
|
+
check_remove_file = True
|
237
261
|
elif name.endswith('.xls') and '参谋店铺整体日报' in name:
|
238
262
|
# 自助取数,店铺日报
|
239
263
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
240
264
|
if len(df) == 0:
|
241
265
|
print(f'{name} 报表数据为空')
|
266
|
+
check_remove_file = True
|
242
267
|
continue
|
243
268
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
269
|
+
check_remove_file = True
|
244
270
|
elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
|
245
271
|
# 自助取数,每日流量
|
246
272
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
247
273
|
if len(df) == 0:
|
248
274
|
print(f'{name} 报表数据为空')
|
275
|
+
check_remove_file = True
|
249
276
|
continue
|
250
277
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
251
278
|
# 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
|
@@ -258,11 +285,13 @@ class DatabaseUpdate:
|
|
258
285
|
else '智能场景' if x == '智能场景(原万相台)'
|
259
286
|
else x
|
260
287
|
)
|
288
|
+
check_remove_file = True
|
261
289
|
elif name.endswith('.xls') and '商品sku' in name:
|
262
290
|
# 自助取数,商品sku
|
263
291
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
264
292
|
if len(df) == 0:
|
265
293
|
print(f'{name} 报表数据为空')
|
294
|
+
check_remove_file = True
|
266
295
|
continue
|
267
296
|
df.rename(columns={
|
268
297
|
'统计日期': '日期',
|
@@ -270,11 +299,13 @@ class DatabaseUpdate:
|
|
270
299
|
'SKU ID': 'sku id',
|
271
300
|
'商品SKU': '商品sku',
|
272
301
|
}, inplace=True)
|
302
|
+
check_remove_file = True
|
273
303
|
elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
|
274
304
|
# 自助取数,月店铺流量来源
|
275
305
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
276
306
|
if len(df) == 0:
|
277
307
|
print(f'{name} 报表数据为空')
|
308
|
+
check_remove_file = True
|
278
309
|
continue
|
279
310
|
df.rename(columns={'统计日期': '数据周期'}, inplace=True)
|
280
311
|
# 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
|
@@ -288,47 +319,56 @@ class DatabaseUpdate:
|
|
288
319
|
else x
|
289
320
|
)
|
290
321
|
df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
|
322
|
+
check_remove_file = True
|
291
323
|
elif name.endswith('.csv') and 'baobei' in name:
|
292
324
|
# 生意经宝贝指标日数据
|
293
325
|
date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
|
294
326
|
if not date: # 阻止月数据及已转换的表格
|
295
327
|
print(f'{name} 不支持或是已转换的表格')
|
296
328
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
329
|
+
check_remove_file = True
|
297
330
|
continue
|
298
331
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
299
332
|
if len(df) == 0:
|
300
333
|
print(f'{name} 报表数据为空')
|
301
334
|
os.remove(os.path.join(root, name))
|
335
|
+
check_remove_file = True
|
302
336
|
continue
|
303
337
|
if '日期' in df.columns.tolist():
|
304
338
|
df.pop('日期')
|
305
339
|
new_date = '-'.join(date[0])
|
306
340
|
df.insert(loc=0, column='日期', value=new_date)
|
307
341
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
342
|
+
check_remove_file = True
|
308
343
|
elif name.endswith('.csv') and '店铺销售指标' in name:
|
309
344
|
# 生意经, 店铺指标,仅限月数据,实际日指标也可以
|
310
345
|
name_st = re.findall(r'(.*)\(分日', name)
|
311
346
|
if not name_st:
|
312
347
|
print(f'{name} 已转换的表格')
|
348
|
+
check_remove_file = True
|
313
349
|
continue
|
314
350
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
315
351
|
if len(df) == 0:
|
316
352
|
print(f'{name} 报表数据为空')
|
353
|
+
check_remove_file = True
|
317
354
|
continue
|
318
355
|
df['日期'] = df['日期'].astype(str).apply(
|
319
356
|
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
|
320
357
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
358
|
+
check_remove_file = True
|
321
359
|
elif name.endswith('csv') and '省份城市分析' in name:
|
322
360
|
# 生意经,地域分布, 仅限日数据
|
323
361
|
pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
|
324
362
|
if not pattern or '省份城市分析2' not in name:
|
325
363
|
print(f'{name} 不支持或已转换的表格')
|
326
364
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
365
|
+
check_remove_file = True
|
327
366
|
continue
|
328
367
|
date = '-'.join(pattern[0][1:])
|
329
368
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
330
369
|
if len(df) == 0:
|
331
370
|
print(f'{name} 报表数据为空')
|
371
|
+
check_remove_file = True
|
332
372
|
continue
|
333
373
|
df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
|
334
374
|
df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
|
@@ -342,12 +382,14 @@ class DatabaseUpdate:
|
|
342
382
|
df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
|
343
383
|
df.replace('NAN', 0, inplace=True)
|
344
384
|
df['笔单价'] = df.apply(lambda x: 0 if x['销售量'] == 0 else 0 if x['销售量'] == '0' else x['笔单价'], axis=1)
|
385
|
+
check_remove_file = True
|
345
386
|
elif name.endswith('csv') and 'order' in name:
|
346
387
|
# 生意经,订单数据,仅限月数据
|
347
388
|
pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
|
348
389
|
if not pattern:
|
349
390
|
print(f'{name} 不支持或已转换的表格')
|
350
391
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
392
|
+
check_remove_file = True
|
351
393
|
continue
|
352
394
|
date1 = pattern[0][1:4]
|
353
395
|
date1 = '-'.join(date1)
|
@@ -357,6 +399,7 @@ class DatabaseUpdate:
|
|
357
399
|
df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
|
358
400
|
if len(df) == 0:
|
359
401
|
print(f'{name} 报表数据为空')
|
402
|
+
check_remove_file = True
|
360
403
|
continue
|
361
404
|
df.insert(loc=0, column='日期', value=date1)
|
362
405
|
df.insert(loc=1, column='数据周期', value=date)
|
@@ -365,30 +408,38 @@ class DatabaseUpdate:
|
|
365
408
|
df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
|
366
409
|
df['颜色编码'] = df['商家编码'].apply(
|
367
410
|
lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
|
411
|
+
check_remove_file = True
|
368
412
|
elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
|
369
413
|
# 直播间成交订单明细
|
370
414
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
371
415
|
if len(df) == 0:
|
372
416
|
print(f'{name} 报表数据为空')
|
417
|
+
check_remove_file = True
|
373
418
|
continue
|
374
419
|
df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
|
375
420
|
df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
|
421
|
+
check_remove_file = True
|
376
422
|
elif name.endswith('.xlsx') and '直播间大盘数据' in name:
|
377
423
|
# 直播间大盘数据
|
378
424
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
379
425
|
if len(df) == 0:
|
380
426
|
print(f'{name} 报表数据为空')
|
427
|
+
check_remove_file = True
|
381
428
|
continue
|
382
429
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
430
|
+
check_remove_file = True
|
383
431
|
elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
|
384
432
|
# 直播业绩-成交拆解
|
385
433
|
df = pd.read_excel(os.path.join(root, name), header=5)
|
386
434
|
if len(df) == 0:
|
387
435
|
print(f'{name} 报表数据为空')
|
436
|
+
check_remove_file = True
|
388
437
|
continue
|
389
438
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
439
|
+
check_remove_file = True
|
390
440
|
elif name.endswith('.csv') and '淘宝店铺数据' in name:
|
391
441
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
442
|
+
check_remove_file = True
|
392
443
|
elif name.endswith('.csv') and '人群洞察' in name:
|
393
444
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
394
445
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
@@ -397,37 +448,50 @@ class DatabaseUpdate:
|
|
397
448
|
if is_move:
|
398
449
|
try:
|
399
450
|
os.remove(os.path.join(root, name)) # 是否移除原文件
|
451
|
+
check_remove_file = True
|
400
452
|
except Exception as e:
|
401
453
|
print(f'{name}, {e}')
|
402
454
|
continue
|
403
455
|
elif name.endswith('.csv') and '客户_客户概况_画像' in name:
|
404
456
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
457
|
+
check_remove_file = True
|
405
458
|
elif name.endswith('.csv') and '市场排行_店铺' in name:
|
406
459
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
460
|
+
check_remove_file = True
|
407
461
|
elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_商品发现' in name:
|
408
462
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
463
|
+
check_remove_file = True
|
409
464
|
elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_汇总' in name:
|
410
465
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
466
|
+
check_remove_file = True
|
411
467
|
elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_商品发现' in name:
|
412
468
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
469
|
+
check_remove_file = True
|
413
470
|
elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_汇总' in name:
|
414
471
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
472
|
+
check_remove_file = True
|
415
473
|
elif name.endswith('.csv') and '搜索排行_搜索' in name:
|
416
474
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
475
|
+
check_remove_file = True
|
417
476
|
elif name.endswith('.csv') and '竞店分析-销售分析-关键指标对比' in name:
|
418
477
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
478
|
+
check_remove_file = True
|
419
479
|
elif name.endswith('.csv') and '竞店分析-销售分析-top商品榜' in name:
|
420
480
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
481
|
+
check_remove_file = True
|
421
482
|
elif name.endswith('.csv') and '竞店分析-来源分析-入店来源' in name:
|
422
483
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
484
|
+
check_remove_file = True
|
423
485
|
elif name.endswith('.csv') and '竞店分析-来源分析-入店搜索词' in name:
|
424
486
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
487
|
+
check_remove_file = True
|
425
488
|
# ----------------------- 京东数据处理分界线 -----------------------
|
426
489
|
# ----------------------- 京东数据处理分界线 -----------------------
|
427
490
|
elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
|
428
491
|
# 京东店铺来源
|
429
492
|
if '按天' not in name:
|
430
493
|
print(f'{name} 京东流量请按天下载')
|
494
|
+
check_remove_file = True
|
431
495
|
continue
|
432
496
|
date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
|
433
497
|
new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
|
@@ -436,6 +500,7 @@ class DatabaseUpdate:
|
|
436
500
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
437
501
|
if len(df) == 0:
|
438
502
|
print(f'{name} 报表数据为空')
|
503
|
+
check_remove_file = True
|
439
504
|
continue
|
440
505
|
df.insert(loc=0, column='日期', value=new_date01)
|
441
506
|
if new_date01 != new_date02:
|
@@ -444,17 +509,20 @@ class DatabaseUpdate:
|
|
444
509
|
for col_2024 in cols: # 京东这个表有字段加了去年日期,删除这些同比数据字段,不然列数量爆炸
|
445
510
|
if '20' in col_2024 and '流量来源' in name:
|
446
511
|
df.drop(col_2024, axis=1, inplace=True)
|
512
|
+
check_remove_file = True
|
447
513
|
elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
|
448
514
|
# 京东商品明细 文件转换
|
449
515
|
date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
|
450
516
|
if not date1[0]:
|
451
517
|
print(f'{name}: 仅支持日数据')
|
518
|
+
check_remove_file = True
|
452
519
|
continue
|
453
520
|
if date1:
|
454
521
|
date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
|
455
522
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
456
523
|
if len(df) == 0:
|
457
524
|
print(f'{name} 报表数据为空')
|
525
|
+
check_remove_file = True
|
458
526
|
continue
|
459
527
|
if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
|
460
528
|
new_name = f'sku_{date1}_全部渠道_商品明细.csv'
|
@@ -471,30 +539,37 @@ class DatabaseUpdate:
|
|
471
539
|
elif 'spu' in new_name:
|
472
540
|
db_name = '京东数据2'
|
473
541
|
collection_name = 'spu_商品明细'
|
542
|
+
check_remove_file = True
|
474
543
|
elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
|
475
544
|
# 京东商品词下排名
|
476
545
|
try:
|
477
546
|
pattern = re.findall(r'(\d{4}-\d{2}-\d{2})-(\d{4}-\d{2}-\d{2})', name)
|
478
547
|
if not pattern:
|
548
|
+
check_remove_file = True
|
479
549
|
continue
|
480
550
|
if pattern[0][0] == pattern[0][1]:
|
481
551
|
print(f'{name}: 检测到数据周期异常,仅支持7天数据')
|
552
|
+
check_remove_file = True
|
482
553
|
continue
|
483
554
|
df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
|
484
555
|
if len(df) == 0:
|
485
556
|
print(f'{name} 报表数据为空')
|
557
|
+
check_remove_file = True
|
486
558
|
continue
|
487
559
|
if len(df.columns.tolist()) < 20:
|
488
560
|
print(f'{name}: 报表可能缺失诊断数据')
|
489
561
|
os.remove(os.path.join(root, name))
|
562
|
+
check_remove_file = True
|
490
563
|
continue
|
491
564
|
df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
|
492
565
|
for col in ['词人气', '搜索点击率']:
|
493
566
|
if col in df.columns.tolist():
|
494
567
|
df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
|
568
|
+
check_remove_file = True
|
495
569
|
except Exception as e:
|
496
570
|
print(e)
|
497
571
|
print(name, '报错')
|
572
|
+
check_remove_file = True
|
498
573
|
continue
|
499
574
|
elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
|
500
575
|
# 京东商品排名
|
@@ -502,11 +577,13 @@ class DatabaseUpdate:
|
|
502
577
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
503
578
|
if len(df) == 0:
|
504
579
|
print(f'{name} 报表数据为空')
|
580
|
+
check_remove_file = True
|
505
581
|
continue
|
506
582
|
df.insert(0, '日期', date_in) # 插入新列
|
507
583
|
df.rename(columns={'SKU': 'skuid'}, inplace=True)
|
508
584
|
if '点击率' in df.columns.tolist():
|
509
585
|
df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
|
586
|
+
check_remove_file = True
|
510
587
|
elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
|
511
588
|
# 京东,竞争-竞店概况-竞店详情-全部渠道
|
512
589
|
date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
|
@@ -515,68 +592,87 @@ class DatabaseUpdate:
|
|
515
592
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
516
593
|
if len(df) == 0:
|
517
594
|
print(f'{name} 报表数据为空')
|
595
|
+
check_remove_file = True
|
518
596
|
continue
|
519
597
|
df.insert(loc=0, column='日期', value=start_date)
|
520
|
-
|
598
|
+
check_remove_file = True
|
599
|
+
elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
|
521
600
|
# 京东 自助报表 店铺日报
|
522
601
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
523
602
|
if len(df) == 0:
|
524
603
|
print(f'{name} 报表数据为空')
|
604
|
+
check_remove_file = True
|
605
|
+
continue
|
606
|
+
if '访客数-全部渠道' not in df.columns.tolist(): # 识别是否真的京东日报
|
525
607
|
continue
|
526
608
|
df['日期'] = df['日期'].apply(
|
527
609
|
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
|
528
610
|
)
|
611
|
+
check_remove_file = True
|
529
612
|
elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
|
530
613
|
# 京东 行业 商家榜单
|
531
614
|
date2 = re.findall(r'_\d{8}-\d+', name)
|
532
615
|
if date2:
|
533
616
|
print(f'{name}: 请下载日数据,不支持其他周期')
|
534
617
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
618
|
+
check_remove_file = True
|
535
619
|
continue
|
536
620
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
537
621
|
if len(df) == 0:
|
538
622
|
print(f'{name} 报表数据为空')
|
623
|
+
check_remove_file = True
|
539
624
|
continue
|
540
625
|
df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
|
541
626
|
df.insert(loc=0, column='类型', value='商家榜单')
|
627
|
+
check_remove_file = True
|
542
628
|
elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
|
543
629
|
# 京东 sku 导出
|
544
630
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
545
631
|
if len(df) == 0:
|
546
632
|
print(f'{name} 报表数据为空')
|
633
|
+
check_remove_file = True
|
547
634
|
continue
|
548
635
|
d_time = datetime.datetime.today().strftime('%Y-%m-%d')
|
549
636
|
df.insert(loc=0, column='日期', value=d_time)
|
550
637
|
df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
|
638
|
+
check_remove_file = True
|
551
639
|
elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
|
552
640
|
# 京东 spu 导出
|
553
641
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
554
642
|
if len(df) == 0:
|
555
643
|
print(f'{name} 报表数据为空')
|
644
|
+
check_remove_file = True
|
556
645
|
continue
|
557
646
|
d_time = datetime.datetime.today().strftime('%Y-%m-%d')
|
558
647
|
df.insert(loc=0, column='日期', value=d_time)
|
648
|
+
check_remove_file = True
|
559
649
|
elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
|
560
650
|
# 京东推广数据
|
561
651
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
562
652
|
if len(df) == 0:
|
563
653
|
print(f'{name} 报表数据为空')
|
654
|
+
check_remove_file = True
|
564
655
|
continue
|
565
656
|
df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
|
657
|
+
check_remove_file = True
|
566
658
|
elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
|
567
659
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
568
660
|
if len(df) == 0:
|
569
661
|
print(f'{name} 报表数据为空')
|
662
|
+
check_remove_file = True
|
570
663
|
continue
|
571
664
|
df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
|
572
665
|
df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
|
573
666
|
df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
|
667
|
+
check_remove_file = True
|
574
668
|
elif name.endswith('.xlsx') and '零售明细统计' in name:
|
575
669
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
576
670
|
if len(df) == 0:
|
577
671
|
print(f'{name} 报表数据为空')
|
672
|
+
check_remove_file = True
|
578
673
|
continue
|
579
674
|
df = df[df['缩略图'] != '合计']
|
675
|
+
check_remove_file = True
|
580
676
|
elif name.endswith('.csv') and '营销概况_全站营销' in name:
|
581
677
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
|
582
678
|
df = df[(df['日期'] != '日期') & (df['日期'] != '汇总') & (df['日期'] != '0') & (df['花费'] != '0') & (df['花费'] != '0.00')]
|
@@ -584,6 +680,7 @@ class DatabaseUpdate:
|
|
584
680
|
df.drop("'当前时间'", axis=1, inplace=True)
|
585
681
|
df.rename(columns={'全站ROI': '全站roi'}, inplace=True)
|
586
682
|
df.insert(loc=1, column='产品线', value='全站营销')
|
683
|
+
check_remove_file = True
|
587
684
|
elif name.endswith('.csv') and '关键词点击成交报表_pbix同步_勿删改' in name:
|
588
685
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
589
686
|
for col in df.columns.tolist():
|
@@ -595,6 +692,7 @@ class DatabaseUpdate:
|
|
595
692
|
df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
|
596
693
|
# min_clm = str(df['日期'].min()).split(' ')[0]
|
597
694
|
# max_clm = str(df['日期'].max()).split(' ')[0]
|
695
|
+
check_remove_file = True
|
598
696
|
|
599
697
|
# 商品素材,必须保持放在最后处理
|
600
698
|
elif name.endswith('xlsx'):
|
@@ -619,8 +717,9 @@ class DatabaseUpdate:
|
|
619
717
|
collection_name = '商品素材导出'
|
620
718
|
else:
|
621
719
|
df = pd.DataFrame()
|
720
|
+
check_remove_file = True
|
622
721
|
|
623
|
-
if is_move:
|
722
|
+
if is_move and check_remove_file:
|
624
723
|
try:
|
625
724
|
os.remove(os.path.join(root, name)) # 是否移除原文件
|
626
725
|
except Exception as e:
|
mdbq/clean/data_clean.py
CHANGED
@@ -895,13 +895,15 @@ class DataClean:
|
|
895
895
|
m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_竞店监控_日数据')
|
896
896
|
os.remove(os.path.join(root, name))
|
897
897
|
|
898
|
-
elif name.endswith('.xls') and '店铺' in name:
|
898
|
+
elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
|
899
899
|
# 京东 自助报表 店铺日报
|
900
900
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
901
901
|
if len(df) == 0:
|
902
902
|
print(f'{name} 报表数据为空')
|
903
903
|
os.remove(os.path.join(root, name))
|
904
904
|
continue
|
905
|
+
if '访客数-全部渠道' not in df.columns.tolist(): # 识别是否真的京东日报
|
906
|
+
continue
|
905
907
|
df['日期'] = df['日期'].apply(
|
906
908
|
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
|
907
909
|
)
|
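mdbq/company/copysh.py
CHANGED
(the +1 -0 diff body for this file, and the +1 -1 diff body for {mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/METADATA, are not present in this extract)
{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/RECORD
CHANGED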
@@ -1,7 +1,7 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/aggregation.py,sha256=
|
4
|
+
mdbq/aggregation/aggregation.py,sha256=QAN378cXlkwonHUDSBYfdZRfHBuqft_HR7Vfr8l87-k,72085
|
5
5
|
mdbq/aggregation/df_types.py,sha256=oQJS2IBU3_IO6GMgbssHuC2yCjNnbta0QPGrFOwNLnU,7591
|
6
6
|
mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
|
7
7
|
mdbq/aggregation/optimize_data.py,sha256=u2Kl_MFtZueXJ57ycy4H2OhXD431RctUYJYCl637uT0,4176
|
@@ -9,9 +9,9 @@ mdbq/aggregation/query_data.py,sha256=32NjVVYLnfFkzD8TflmNVhpdQTLRRUrb9toMGApSOC
|
|
9
9
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
10
10
|
mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
|
11
11
|
mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
|
12
|
-
mdbq/clean/data_clean.py,sha256=
|
12
|
+
mdbq/clean/data_clean.py,sha256=y83uqOyM6nL0d3ClUqYMjE23ghBEkhz9uv19qrxA8NA,100980
|
13
13
|
mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
|
14
|
-
mdbq/company/copysh.py,sha256=
|
14
|
+
mdbq/company/copysh.py,sha256=VUaaJPXPYPHWwnkdK77PWz_dAXZyEmYBA9Df1yROHAc,17764
|
15
15
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/config/get_myconf.py,sha256=-CFEW0dQh4OIwVgwK-cL0eVp1LN3PjJgN89d4P5TB9I,6011
|
17
17
|
mdbq/config/products.py,sha256=vIK8DJ-F3XXwvNPK-4OJq2tZITNlL6Sub8QBdoOng8U,5676
|
@@ -36,7 +36,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
|
|
36
36
|
mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
|
37
37
|
mdbq/pbix/refresh_all.py,sha256=0uAnBKCd5cx5FLTkawN1GV9yi87rfyMgYal5LABtumQ,7186
|
38
38
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
39
|
-
mdbq-1.9.
|
40
|
-
mdbq-1.9.
|
41
|
-
mdbq-1.9.
|
42
|
-
mdbq-1.9.
|
39
|
+
mdbq-1.9.7.dist-info/METADATA,sha256=rqBOduo-xKxLyXbxt83RXob4dVYqlhdql_WL06TysmY,245
|
40
|
+
mdbq-1.9.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
41
|
+
mdbq-1.9.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
42
|
+
mdbq-1.9.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|