mdbq 1.9.4__py3-none-any.whl → 1.9.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +98 -1
- mdbq/aggregation/query_data.py +2 -2
- {mdbq-1.9.4.dist-info → mdbq-1.9.6.dist-info}/METADATA +1 -1
- {mdbq-1.9.4.dist-info → mdbq-1.9.6.dist-info}/RECORD +6 -6
- {mdbq-1.9.4.dist-info → mdbq-1.9.6.dist-info}/WHEEL +0 -0
- {mdbq-1.9.4.dist-info → mdbq-1.9.6.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -59,6 +59,7 @@ class DatabaseUpdate:
|
|
59
59
|
|
60
60
|
for root, dirs, files in os.walk(self.path, topdown=False):
|
61
61
|
for name in files:
|
62
|
+
check_remove_file = False # 设置这个参数的目的: 避免误删其他文件, 不是本程序数据清洗覆盖的文件不做干预
|
62
63
|
if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
|
63
64
|
continue
|
64
65
|
db_name = None # 初始化/重置变量,避免进入下一个循环
|
@@ -93,17 +94,21 @@ class DatabaseUpdate:
|
|
93
94
|
ck = df.columns.tolist()
|
94
95
|
if '场景名字' not in ck:
|
95
96
|
print(f'1.2.0 {name} 报表字段缺失, 请选择Pbix数据模板下载')
|
97
|
+
check_remove_file = True
|
96
98
|
continue
|
97
99
|
if len(df) == 0:
|
98
100
|
print(f'1.3.0 {name} 报表是空的, 请重新下载')
|
101
|
+
check_remove_file = True
|
99
102
|
continue
|
100
103
|
cols = df.columns.tolist()
|
101
104
|
if '日期' not in cols:
|
102
105
|
print(f'1.4.0 {name} 报表不包含分日数据, 已跳过')
|
106
|
+
check_remove_file = True
|
103
107
|
continue
|
104
108
|
if '省' in cols:
|
105
109
|
if '市' not in cols:
|
106
110
|
print(f'1.5.0 {name} 请下载市级地域报表,而不是省报表')
|
111
|
+
check_remove_file = True
|
107
112
|
continue
|
108
113
|
# df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
|
109
114
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
@@ -114,11 +119,13 @@ class DatabaseUpdate:
|
|
114
119
|
else:
|
115
120
|
db_name = '推广数据2'
|
116
121
|
collection_name = f'{tg_name}'
|
122
|
+
check_remove_file = True
|
117
123
|
if name.endswith('.csv') and '超级直播' in name:
|
118
124
|
# 超级直播
|
119
125
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
120
126
|
if len(df) == 0:
|
121
127
|
print(f'{name} 报表数据为空')
|
128
|
+
check_remove_file = True
|
122
129
|
continue
|
123
130
|
pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
|
124
131
|
if not pattern: # 说明已经转换过
|
@@ -130,27 +137,34 @@ class DatabaseUpdate:
|
|
130
137
|
shop_name = ''
|
131
138
|
# df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
|
132
139
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
140
|
+
check_remove_file = True
|
133
141
|
elif name.endswith('.xls') and '短直联投' in name:
|
134
142
|
# 短直联投
|
135
143
|
df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
|
136
144
|
df = pd.concat(df)
|
137
145
|
if len(df) == 0:
|
138
146
|
print(f'{name} 报表数据为空')
|
147
|
+
check_remove_file = True
|
139
148
|
continue
|
140
149
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
150
|
+
check_remove_file = True
|
141
151
|
elif name.endswith('.xls') and '视频加速推广' in name:
|
142
152
|
# 超级短视频
|
143
153
|
df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
|
144
154
|
df = pd.concat(df)
|
145
155
|
if len(df) == 0:
|
146
156
|
print(f'{name} 报表数据为空')
|
157
|
+
check_remove_file = True
|
147
158
|
continue
|
148
159
|
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
|
160
|
+
check_remove_file = True
|
149
161
|
if '人群报表汇总' in name:
|
150
162
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
|
151
163
|
if len(df) == 0:
|
152
164
|
print(f'{name} 报表数据为空')
|
165
|
+
check_remove_file = True
|
153
166
|
continue
|
167
|
+
check_remove_file = True
|
154
168
|
# ----------------- 推广报表 分割线 -----------------
|
155
169
|
# ----------------- 推广报表 分割线 -----------------
|
156
170
|
date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
|
@@ -161,6 +175,7 @@ class DatabaseUpdate:
|
|
161
175
|
df = pd.read_excel(os.path.join(root, name), header=5)
|
162
176
|
if len(df) == 0:
|
163
177
|
print(f'{name} 报表数据为空')
|
178
|
+
check_remove_file = True
|
164
179
|
continue
|
165
180
|
# df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
|
166
181
|
# df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
@@ -186,15 +201,19 @@ class DatabaseUpdate:
|
|
186
201
|
collection_name='店铺来源_月数据_旧版'
|
187
202
|
else:
|
188
203
|
collection_name='店铺来源_日数据_旧版'
|
204
|
+
check_remove_file = True
|
189
205
|
elif name.endswith('.csv') and '客户运营平台_客户列表' in name:
|
190
206
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
207
|
+
check_remove_file = True
|
191
208
|
elif name.endswith('.xlsx') and '直播分场次效果' in name:
|
192
209
|
pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
|
193
210
|
if pattern:
|
211
|
+
check_remove_file = True
|
194
212
|
continue
|
195
213
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
196
214
|
if len(df) == 0:
|
197
215
|
print(f'{name} 报表数据为空')
|
216
|
+
check_remove_file = True
|
198
217
|
continue
|
199
218
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
200
219
|
df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
@@ -203,6 +222,7 @@ class DatabaseUpdate:
|
|
203
222
|
df['日期'] = df['日期'].apply(
|
204
223
|
lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
|
205
224
|
df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
|
225
|
+
check_remove_file = True
|
206
226
|
|
207
227
|
elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
|
208
228
|
# 店铺来源,手淘搜索,关键词
|
@@ -210,6 +230,7 @@ class DatabaseUpdate:
|
|
210
230
|
df = pd.read_excel(os.path.join(root, name), header=5)
|
211
231
|
if len(df) == 0:
|
212
232
|
print(f'{name} 报表数据为空')
|
233
|
+
check_remove_file = True
|
213
234
|
continue
|
214
235
|
df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
215
236
|
df.insert(loc=0, column='日期', value=pattern[0][1])
|
@@ -221,12 +242,14 @@ class DatabaseUpdate:
|
|
221
242
|
if pattern[0][0] != pattern[0][1]:
|
222
243
|
data_lis = pattern[0][0] + '_' + pattern[0][1]
|
223
244
|
df.insert(loc=1, column='数据周期', value=data_lis)
|
245
|
+
check_remove_file = True
|
224
246
|
|
225
247
|
elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
|
226
248
|
# 店铺商品排行
|
227
249
|
df = pd.read_excel(os.path.join(root, name), header=4)
|
228
250
|
if len(df) == 0:
|
229
251
|
print(f'{name} 报表数据为空')
|
252
|
+
check_remove_file = True
|
230
253
|
continue
|
231
254
|
# df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
|
232
255
|
# df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
@@ -234,18 +257,22 @@ class DatabaseUpdate:
|
|
234
257
|
if date01[0] != date02[0]:
|
235
258
|
data_lis = date01[0] + '_' + date02[0]
|
236
259
|
df.insert(loc=1, column='数据周期', value=data_lis)
|
260
|
+
check_remove_file = True
|
237
261
|
elif name.endswith('.xls') and '参谋店铺整体日报' in name:
|
238
262
|
# 自助取数,店铺日报
|
239
263
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
240
264
|
if len(df) == 0:
|
241
265
|
print(f'{name} 报表数据为空')
|
266
|
+
check_remove_file = True
|
242
267
|
continue
|
243
268
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
269
|
+
check_remove_file = True
|
244
270
|
elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
|
245
271
|
# 自助取数,每日流量
|
246
272
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
247
273
|
if len(df) == 0:
|
248
274
|
print(f'{name} 报表数据为空')
|
275
|
+
check_remove_file = True
|
249
276
|
continue
|
250
277
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
251
278
|
# 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
|
@@ -258,11 +285,13 @@ class DatabaseUpdate:
|
|
258
285
|
else '智能场景' if x == '智能场景(原万相台)'
|
259
286
|
else x
|
260
287
|
)
|
288
|
+
check_remove_file = True
|
261
289
|
elif name.endswith('.xls') and '商品sku' in name:
|
262
290
|
# 自助取数,商品sku
|
263
291
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
264
292
|
if len(df) == 0:
|
265
293
|
print(f'{name} 报表数据为空')
|
294
|
+
check_remove_file = True
|
266
295
|
continue
|
267
296
|
df.rename(columns={
|
268
297
|
'统计日期': '日期',
|
@@ -270,11 +299,13 @@ class DatabaseUpdate:
|
|
270
299
|
'SKU ID': 'sku id',
|
271
300
|
'商品SKU': '商品sku',
|
272
301
|
}, inplace=True)
|
302
|
+
check_remove_file = True
|
273
303
|
elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
|
274
304
|
# 自助取数,月店铺流量来源
|
275
305
|
df = pd.read_excel(os.path.join(root, name), header=7)
|
276
306
|
if len(df) == 0:
|
277
307
|
print(f'{name} 报表数据为空')
|
308
|
+
check_remove_file = True
|
278
309
|
continue
|
279
310
|
df.rename(columns={'统计日期': '数据周期'}, inplace=True)
|
280
311
|
# 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
|
@@ -288,47 +319,56 @@ class DatabaseUpdate:
|
|
288
319
|
else x
|
289
320
|
)
|
290
321
|
df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
|
322
|
+
check_remove_file = True
|
291
323
|
elif name.endswith('.csv') and 'baobei' in name:
|
292
324
|
# 生意经宝贝指标日数据
|
293
325
|
date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
|
294
326
|
if not date: # 阻止月数据及已转换的表格
|
295
327
|
print(f'{name} 不支持或是已转换的表格')
|
296
328
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
329
|
+
check_remove_file = True
|
297
330
|
continue
|
298
331
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
299
332
|
if len(df) == 0:
|
300
333
|
print(f'{name} 报表数据为空')
|
301
334
|
os.remove(os.path.join(root, name))
|
335
|
+
check_remove_file = True
|
302
336
|
continue
|
303
337
|
if '日期' in df.columns.tolist():
|
304
338
|
df.pop('日期')
|
305
339
|
new_date = '-'.join(date[0])
|
306
340
|
df.insert(loc=0, column='日期', value=new_date)
|
307
341
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
342
|
+
check_remove_file = True
|
308
343
|
elif name.endswith('.csv') and '店铺销售指标' in name:
|
309
344
|
# 生意经, 店铺指标,仅限月数据,实际日指标也可以
|
310
345
|
name_st = re.findall(r'(.*)\(分日', name)
|
311
346
|
if not name_st:
|
312
347
|
print(f'{name} 已转换的表格')
|
348
|
+
check_remove_file = True
|
313
349
|
continue
|
314
350
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
315
351
|
if len(df) == 0:
|
316
352
|
print(f'{name} 报表数据为空')
|
353
|
+
check_remove_file = True
|
317
354
|
continue
|
318
355
|
df['日期'] = df['日期'].astype(str).apply(
|
319
356
|
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
|
320
357
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
358
|
+
check_remove_file = True
|
321
359
|
elif name.endswith('csv') and '省份城市分析' in name:
|
322
360
|
# 生意经,地域分布, 仅限日数据
|
323
361
|
pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
|
324
362
|
if not pattern or '省份城市分析2' not in name:
|
325
363
|
print(f'{name} 不支持或已转换的表格')
|
326
364
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
365
|
+
check_remove_file = True
|
327
366
|
continue
|
328
367
|
date = '-'.join(pattern[0][1:])
|
329
368
|
df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
|
330
369
|
if len(df) == 0:
|
331
370
|
print(f'{name} 报表数据为空')
|
371
|
+
check_remove_file = True
|
332
372
|
continue
|
333
373
|
df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
|
334
374
|
df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
|
@@ -342,12 +382,14 @@ class DatabaseUpdate:
|
|
342
382
|
df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
|
343
383
|
df.replace('NAN', 0, inplace=True)
|
344
384
|
df['笔单价'] = df.apply(lambda x: 0 if x['销售量'] == 0 else 0 if x['销售量'] == '0' else x['笔单价'], axis=1)
|
385
|
+
check_remove_file = True
|
345
386
|
elif name.endswith('csv') and 'order' in name:
|
346
387
|
# 生意经,订单数据,仅限月数据
|
347
388
|
pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
|
348
389
|
if not pattern:
|
349
390
|
print(f'{name} 不支持或已转换的表格')
|
350
391
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
392
|
+
check_remove_file = True
|
351
393
|
continue
|
352
394
|
date1 = pattern[0][1:4]
|
353
395
|
date1 = '-'.join(date1)
|
@@ -357,6 +399,7 @@ class DatabaseUpdate:
|
|
357
399
|
df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
|
358
400
|
if len(df) == 0:
|
359
401
|
print(f'{name} 报表数据为空')
|
402
|
+
check_remove_file = True
|
360
403
|
continue
|
361
404
|
df.insert(loc=0, column='日期', value=date1)
|
362
405
|
df.insert(loc=1, column='数据周期', value=date)
|
@@ -365,30 +408,38 @@ class DatabaseUpdate:
|
|
365
408
|
df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
|
366
409
|
df['颜色编码'] = df['商家编码'].apply(
|
367
410
|
lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
|
411
|
+
check_remove_file = True
|
368
412
|
elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
|
369
413
|
# 直播间成交订单明细
|
370
414
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
371
415
|
if len(df) == 0:
|
372
416
|
print(f'{name} 报表数据为空')
|
417
|
+
check_remove_file = True
|
373
418
|
continue
|
374
419
|
df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
|
375
420
|
df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
|
421
|
+
check_remove_file = True
|
376
422
|
elif name.endswith('.xlsx') and '直播间大盘数据' in name:
|
377
423
|
# 直播间大盘数据
|
378
424
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
379
425
|
if len(df) == 0:
|
380
426
|
print(f'{name} 报表数据为空')
|
427
|
+
check_remove_file = True
|
381
428
|
continue
|
382
429
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
430
|
+
check_remove_file = True
|
383
431
|
elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
|
384
432
|
# 直播业绩-成交拆解
|
385
433
|
df = pd.read_excel(os.path.join(root, name), header=5)
|
386
434
|
if len(df) == 0:
|
387
435
|
print(f'{name} 报表数据为空')
|
436
|
+
check_remove_file = True
|
388
437
|
continue
|
389
438
|
df.rename(columns={'统计日期': '日期'}, inplace=True)
|
439
|
+
check_remove_file = True
|
390
440
|
elif name.endswith('.csv') and '淘宝店铺数据' in name:
|
391
441
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
442
|
+
check_remove_file = True
|
392
443
|
elif name.endswith('.csv') and '人群洞察' in name:
|
393
444
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
394
445
|
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
|
@@ -397,37 +448,50 @@ class DatabaseUpdate:
|
|
397
448
|
if is_move:
|
398
449
|
try:
|
399
450
|
os.remove(os.path.join(root, name)) # 是否移除原文件
|
451
|
+
check_remove_file = True
|
400
452
|
except Exception as e:
|
401
453
|
print(f'{name}, {e}')
|
402
454
|
continue
|
403
455
|
elif name.endswith('.csv') and '客户_客户概况_画像' in name:
|
404
456
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
457
|
+
check_remove_file = True
|
405
458
|
elif name.endswith('.csv') and '市场排行_店铺' in name:
|
406
459
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
460
|
+
check_remove_file = True
|
407
461
|
elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_商品发现' in name:
|
408
462
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
463
|
+
check_remove_file = True
|
409
464
|
elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_汇总' in name:
|
410
465
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
466
|
+
check_remove_file = True
|
411
467
|
elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_商品发现' in name:
|
412
468
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
469
|
+
check_remove_file = True
|
413
470
|
elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_汇总' in name:
|
414
471
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
472
|
+
check_remove_file = True
|
415
473
|
elif name.endswith('.csv') and '搜索排行_搜索' in name:
|
416
474
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
475
|
+
check_remove_file = True
|
417
476
|
elif name.endswith('.csv') and '竞店分析-销售分析-关键指标对比' in name:
|
418
477
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
478
|
+
check_remove_file = True
|
419
479
|
elif name.endswith('.csv') and '竞店分析-销售分析-top商品榜' in name:
|
420
480
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
481
|
+
check_remove_file = True
|
421
482
|
elif name.endswith('.csv') and '竞店分析-来源分析-入店来源' in name:
|
422
483
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
484
|
+
check_remove_file = True
|
423
485
|
elif name.endswith('.csv') and '竞店分析-来源分析-入店搜索词' in name:
|
424
486
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
487
|
+
check_remove_file = True
|
425
488
|
# ----------------------- 京东数据处理分界线 -----------------------
|
426
489
|
# ----------------------- 京东数据处理分界线 -----------------------
|
427
490
|
elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
|
428
491
|
# 京东店铺来源
|
429
492
|
if '按天' not in name:
|
430
493
|
print(f'{name} 京东流量请按天下载')
|
494
|
+
check_remove_file = True
|
431
495
|
continue
|
432
496
|
date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
|
433
497
|
new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
|
@@ -436,6 +500,7 @@ class DatabaseUpdate:
|
|
436
500
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
437
501
|
if len(df) == 0:
|
438
502
|
print(f'{name} 报表数据为空')
|
503
|
+
check_remove_file = True
|
439
504
|
continue
|
440
505
|
df.insert(loc=0, column='日期', value=new_date01)
|
441
506
|
if new_date01 != new_date02:
|
@@ -444,17 +509,20 @@ class DatabaseUpdate:
|
|
444
509
|
for col_2024 in cols: # 京东这个表有字段加了去年日期,删除这些同比数据字段,不然列数量爆炸
|
445
510
|
if '20' in col_2024 and '流量来源' in name:
|
446
511
|
df.drop(col_2024, axis=1, inplace=True)
|
512
|
+
check_remove_file = True
|
447
513
|
elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
|
448
514
|
# 京东商品明细 文件转换
|
449
515
|
date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
|
450
516
|
if not date1[0]:
|
451
517
|
print(f'{name}: 仅支持日数据')
|
518
|
+
check_remove_file = True
|
452
519
|
continue
|
453
520
|
if date1:
|
454
521
|
date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
|
455
522
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
456
523
|
if len(df) == 0:
|
457
524
|
print(f'{name} 报表数据为空')
|
525
|
+
check_remove_file = True
|
458
526
|
continue
|
459
527
|
if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
|
460
528
|
new_name = f'sku_{date1}_全部渠道_商品明细.csv'
|
@@ -471,30 +539,37 @@ class DatabaseUpdate:
|
|
471
539
|
elif 'spu' in new_name:
|
472
540
|
db_name = '京东数据2'
|
473
541
|
collection_name = 'spu_商品明细'
|
542
|
+
check_remove_file = True
|
474
543
|
elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
|
475
544
|
# 京东商品词下排名
|
476
545
|
try:
|
477
546
|
pattern = re.findall(r'(\d{4}-\d{2}-\d{2})-(\d{4}-\d{2}-\d{2})', name)
|
478
547
|
if not pattern:
|
548
|
+
check_remove_file = True
|
479
549
|
continue
|
480
550
|
if pattern[0][0] == pattern[0][1]:
|
481
551
|
print(f'{name}: 检测到数据周期异常,仅支持7天数据')
|
552
|
+
check_remove_file = True
|
482
553
|
continue
|
483
554
|
df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
|
484
555
|
if len(df) == 0:
|
485
556
|
print(f'{name} 报表数据为空')
|
557
|
+
check_remove_file = True
|
486
558
|
continue
|
487
559
|
if len(df.columns.tolist()) < 20:
|
488
560
|
print(f'{name}: 报表可能缺失诊断数据')
|
489
561
|
os.remove(os.path.join(root, name))
|
562
|
+
check_remove_file = True
|
490
563
|
continue
|
491
564
|
df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
|
492
565
|
for col in ['词人气', '搜索点击率']:
|
493
566
|
if col in df.columns.tolist():
|
494
567
|
df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
|
568
|
+
check_remove_file = True
|
495
569
|
except Exception as e:
|
496
570
|
print(e)
|
497
571
|
print(name, '报错')
|
572
|
+
check_remove_file = True
|
498
573
|
continue
|
499
574
|
elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
|
500
575
|
# 京东商品排名
|
@@ -502,11 +577,13 @@ class DatabaseUpdate:
|
|
502
577
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
503
578
|
if len(df) == 0:
|
504
579
|
print(f'{name} 报表数据为空')
|
580
|
+
check_remove_file = True
|
505
581
|
continue
|
506
582
|
df.insert(0, '日期', date_in) # 插入新列
|
507
583
|
df.rename(columns={'SKU': 'skuid'}, inplace=True)
|
508
584
|
if '点击率' in df.columns.tolist():
|
509
585
|
df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
|
586
|
+
check_remove_file = True
|
510
587
|
elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
|
511
588
|
# 京东,竞争-竞店概况-竞店详情-全部渠道
|
512
589
|
date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
|
@@ -515,68 +592,85 @@ class DatabaseUpdate:
|
|
515
592
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
516
593
|
if len(df) == 0:
|
517
594
|
print(f'{name} 报表数据为空')
|
595
|
+
check_remove_file = True
|
518
596
|
continue
|
519
597
|
df.insert(loc=0, column='日期', value=start_date)
|
598
|
+
check_remove_file = True
|
520
599
|
elif name.endswith('.xls') and 'JD店铺日报_店铺' in name:
|
521
600
|
# 京东 自助报表 店铺日报
|
522
601
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
523
602
|
if len(df) == 0:
|
524
603
|
print(f'{name} 报表数据为空')
|
604
|
+
check_remove_file = True
|
525
605
|
continue
|
526
606
|
df['日期'] = df['日期'].apply(
|
527
607
|
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
|
528
608
|
)
|
609
|
+
check_remove_file = True
|
529
610
|
elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
|
530
611
|
# 京东 行业 商家榜单
|
531
612
|
date2 = re.findall(r'_\d{8}-\d+', name)
|
532
613
|
if date2:
|
533
614
|
print(f'{name}: 请下载日数据,不支持其他周期')
|
534
615
|
# os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
616
|
+
check_remove_file = True
|
535
617
|
continue
|
536
618
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
537
619
|
if len(df) == 0:
|
538
620
|
print(f'{name} 报表数据为空')
|
621
|
+
check_remove_file = True
|
539
622
|
continue
|
540
623
|
df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
|
541
624
|
df.insert(loc=0, column='类型', value='商家榜单')
|
625
|
+
check_remove_file = True
|
542
626
|
elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
|
543
627
|
# 京东 sku 导出
|
544
628
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
545
629
|
if len(df) == 0:
|
546
630
|
print(f'{name} 报表数据为空')
|
631
|
+
check_remove_file = True
|
547
632
|
continue
|
548
633
|
d_time = datetime.datetime.today().strftime('%Y-%m-%d')
|
549
634
|
df.insert(loc=0, column='日期', value=d_time)
|
550
635
|
df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
|
636
|
+
check_remove_file = True
|
551
637
|
elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
|
552
638
|
# 京东 spu 导出
|
553
639
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
554
640
|
if len(df) == 0:
|
555
641
|
print(f'{name} 报表数据为空')
|
642
|
+
check_remove_file = True
|
556
643
|
continue
|
557
644
|
d_time = datetime.datetime.today().strftime('%Y-%m-%d')
|
558
645
|
df.insert(loc=0, column='日期', value=d_time)
|
646
|
+
check_remove_file = True
|
559
647
|
elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
|
560
648
|
# 京东推广数据
|
561
649
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
562
650
|
if len(df) == 0:
|
563
651
|
print(f'{name} 报表数据为空')
|
652
|
+
check_remove_file = True
|
564
653
|
continue
|
565
654
|
df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
|
655
|
+
check_remove_file = True
|
566
656
|
elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
|
567
657
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
568
658
|
if len(df) == 0:
|
569
659
|
print(f'{name} 报表数据为空')
|
660
|
+
check_remove_file = True
|
570
661
|
continue
|
571
662
|
df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
|
572
663
|
df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
|
573
664
|
df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
|
665
|
+
check_remove_file = True
|
574
666
|
elif name.endswith('.xlsx') and '零售明细统计' in name:
|
575
667
|
df = pd.read_excel(os.path.join(root, name), header=0)
|
576
668
|
if len(df) == 0:
|
577
669
|
print(f'{name} 报表数据为空')
|
670
|
+
check_remove_file = True
|
578
671
|
continue
|
579
672
|
df = df[df['缩略图'] != '合计']
|
673
|
+
check_remove_file = True
|
580
674
|
elif name.endswith('.csv') and '营销概况_全站营销' in name:
|
581
675
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
|
582
676
|
df = df[(df['日期'] != '日期') & (df['日期'] != '汇总') & (df['日期'] != '0') & (df['花费'] != '0') & (df['花费'] != '0.00')]
|
@@ -584,6 +678,7 @@ class DatabaseUpdate:
|
|
584
678
|
df.drop("'当前时间'", axis=1, inplace=True)
|
585
679
|
df.rename(columns={'全站ROI': '全站roi'}, inplace=True)
|
586
680
|
df.insert(loc=1, column='产品线', value='全站营销')
|
681
|
+
check_remove_file = True
|
587
682
|
elif name.endswith('.csv') and '关键词点击成交报表_pbix同步_勿删改' in name:
|
588
683
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
589
684
|
for col in df.columns.tolist():
|
@@ -595,6 +690,7 @@ class DatabaseUpdate:
|
|
595
690
|
df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
|
596
691
|
# min_clm = str(df['日期'].min()).split(' ')[0]
|
597
692
|
# max_clm = str(df['日期'].max()).split(' ')[0]
|
693
|
+
check_remove_file = True
|
598
694
|
|
599
695
|
# 商品素材,必须保持放在最后处理
|
600
696
|
elif name.endswith('xlsx'):
|
@@ -619,8 +715,9 @@ class DatabaseUpdate:
|
|
619
715
|
collection_name = '商品素材导出'
|
620
716
|
else:
|
621
717
|
df = pd.DataFrame()
|
718
|
+
check_remove_file = True
|
622
719
|
|
623
|
-
if is_move:
|
720
|
+
if is_move and check_remove_file:
|
624
721
|
try:
|
625
722
|
os.remove(os.path.join(root, name)) # 是否移除原文件
|
626
723
|
except Exception as e:
|
mdbq/aggregation/query_data.py
CHANGED
@@ -1061,7 +1061,7 @@ class GroupBy:
|
|
1061
1061
|
)
|
1062
1062
|
return df
|
1063
1063
|
elif '直播场次分析' in table_name:
|
1064
|
-
df.drop_duplicates(subset=['
|
1064
|
+
df.drop_duplicates(subset=['场次id'], keep='first', inplace=True, ignore_index=True)
|
1065
1065
|
return df
|
1066
1066
|
else:
|
1067
1067
|
print(f'<{table_name}>: Groupby 类尚未配置,数据为空')
|
@@ -1434,7 +1434,7 @@ def data_aggregation(service_databases=[{}], months=1):
|
|
1434
1434
|
{
|
1435
1435
|
'数据库名': '聚合数据',
|
1436
1436
|
'集合名': '生意参谋_直播场次分析',
|
1437
|
-
'唯一主键': ['
|
1437
|
+
'唯一主键': ['场次id'],
|
1438
1438
|
'数据主体': sdq.zb_ccfx(),
|
1439
1439
|
},
|
1440
1440
|
]
|
@@ -1,11 +1,11 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/aggregation.py,sha256=
|
4
|
+
mdbq/aggregation/aggregation.py,sha256=KxAxlhtYtJKjTgWWmff5BlO8PD_FpGmMxzxVYASrFQ8,71917
|
5
5
|
mdbq/aggregation/df_types.py,sha256=oQJS2IBU3_IO6GMgbssHuC2yCjNnbta0QPGrFOwNLnU,7591
|
6
6
|
mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
|
7
7
|
mdbq/aggregation/optimize_data.py,sha256=u2Kl_MFtZueXJ57ycy4H2OhXD431RctUYJYCl637uT0,4176
|
8
|
-
mdbq/aggregation/query_data.py,sha256=
|
8
|
+
mdbq/aggregation/query_data.py,sha256=32NjVVYLnfFkzD8TflmNVhpdQTLRRUrb9toMGApSOC8,72379
|
9
9
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
10
10
|
mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
|
11
11
|
mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
|
@@ -36,7 +36,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
|
|
36
36
|
mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
|
37
37
|
mdbq/pbix/refresh_all.py,sha256=0uAnBKCd5cx5FLTkawN1GV9yi87rfyMgYal5LABtumQ,7186
|
38
38
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
39
|
-
mdbq-1.9.
|
40
|
-
mdbq-1.9.
|
41
|
-
mdbq-1.9.
|
42
|
-
mdbq-1.9.
|
39
|
+
mdbq-1.9.6.dist-info/METADATA,sha256=fdqv73OvAg866cfG7kXzJDdg27mvNxWTgFzulIi1B6Y,245
|
40
|
+
mdbq-1.9.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
41
|
+
mdbq-1.9.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
42
|
+
mdbq-1.9.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|