mdbq 1.9.5__py3-none-any.whl → 1.9.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -59,6 +59,7 @@ class DatabaseUpdate:
59
59
 
60
60
  for root, dirs, files in os.walk(self.path, topdown=False):
61
61
  for name in files:
62
+ check_remove_file = False # 设置这个参数的目的: 避免误删其他文件, 不是本程序数据清洗覆盖的文件不做干预
62
63
  if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
63
64
  continue
64
65
  db_name = None # 初始化/重置变量,避免进入下一个循环
@@ -93,17 +94,21 @@ class DatabaseUpdate:
93
94
  ck = df.columns.tolist()
94
95
  if '场景名字' not in ck:
95
96
  print(f'1.2.0 {name} 报表字段缺失, 请选择Pbix数据模板下载')
97
+ check_remove_file = True
96
98
  continue
97
99
  if len(df) == 0:
98
100
  print(f'1.3.0 {name} 报表是空的, 请重新下载')
101
+ check_remove_file = True
99
102
  continue
100
103
  cols = df.columns.tolist()
101
104
  if '日期' not in cols:
102
105
  print(f'1.4.0 {name} 报表不包含分日数据, 已跳过')
106
+ check_remove_file = True
103
107
  continue
104
108
  if '省' in cols:
105
109
  if '市' not in cols:
106
110
  print(f'1.5.0 {name} 请下载市级地域报表,而不是省报表')
111
+ check_remove_file = True
107
112
  continue
108
113
  # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
109
114
  # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
@@ -114,11 +119,13 @@ class DatabaseUpdate:
114
119
  else:
115
120
  db_name = '推广数据2'
116
121
  collection_name = f'{tg_name}'
122
+ check_remove_file = True
117
123
  if name.endswith('.csv') and '超级直播' in name:
118
124
  # 超级直播
119
125
  df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
120
126
  if len(df) == 0:
121
127
  print(f'{name} 报表数据为空')
128
+ check_remove_file = True
122
129
  continue
123
130
  pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
124
131
  if not pattern: # 说明已经转换过
@@ -130,27 +137,34 @@ class DatabaseUpdate:
130
137
  shop_name = ''
131
138
  # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
132
139
  # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
140
+ check_remove_file = True
133
141
  elif name.endswith('.xls') and '短直联投' in name:
134
142
  # 短直联投
135
143
  df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
136
144
  df = pd.concat(df)
137
145
  if len(df) == 0:
138
146
  print(f'{name} 报表数据为空')
147
+ check_remove_file = True
139
148
  continue
140
149
  # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
150
+ check_remove_file = True
141
151
  elif name.endswith('.xls') and '视频加速推广' in name:
142
152
  # 超级短视频
143
153
  df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
144
154
  df = pd.concat(df)
145
155
  if len(df) == 0:
146
156
  print(f'{name} 报表数据为空')
157
+ check_remove_file = True
147
158
  continue
148
159
  # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
160
+ check_remove_file = True
149
161
  if '人群报表汇总' in name:
150
162
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
151
163
  if len(df) == 0:
152
164
  print(f'{name} 报表数据为空')
165
+ check_remove_file = True
153
166
  continue
167
+ check_remove_file = True
154
168
  # ----------------- 推广报表 分割线 -----------------
155
169
  # ----------------- 推广报表 分割线 -----------------
156
170
  date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
@@ -161,6 +175,7 @@ class DatabaseUpdate:
161
175
  df = pd.read_excel(os.path.join(root, name), header=5)
162
176
  if len(df) == 0:
163
177
  print(f'{name} 报表数据为空')
178
+ check_remove_file = True
164
179
  continue
165
180
  # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
166
181
  # df.replace(to_replace=[','], value='', regex=True, inplace=True)
@@ -186,15 +201,19 @@ class DatabaseUpdate:
186
201
  collection_name='店铺来源_月数据_旧版'
187
202
  else:
188
203
  collection_name='店铺来源_日数据_旧版'
204
+ check_remove_file = True
189
205
  elif name.endswith('.csv') and '客户运营平台_客户列表' in name:
190
206
  df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
207
+ check_remove_file = True
191
208
  elif name.endswith('.xlsx') and '直播分场次效果' in name:
192
209
  pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
193
210
  if pattern:
211
+ check_remove_file = True
194
212
  continue
195
213
  df = pd.read_excel(os.path.join(root, name), header=0)
196
214
  if len(df) == 0:
197
215
  print(f'{name} 报表数据为空')
216
+ check_remove_file = True
198
217
  continue
199
218
  df.replace(to_replace=['--'], value='', regex=False, inplace=True)
200
219
  df.replace(to_replace=[','], value='', regex=True, inplace=True)
@@ -203,6 +222,7 @@ class DatabaseUpdate:
203
222
  df['日期'] = df['日期'].apply(
204
223
  lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
205
224
  df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
225
+ check_remove_file = True
206
226
 
207
227
  elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
208
228
  # 店铺来源,手淘搜索,关键词
@@ -210,6 +230,7 @@ class DatabaseUpdate:
210
230
  df = pd.read_excel(os.path.join(root, name), header=5)
211
231
  if len(df) == 0:
212
232
  print(f'{name} 报表数据为空')
233
+ check_remove_file = True
213
234
  continue
214
235
  df.replace(to_replace=[','], value='', regex=True, inplace=True)
215
236
  df.insert(loc=0, column='日期', value=pattern[0][1])
@@ -221,12 +242,14 @@ class DatabaseUpdate:
221
242
  if pattern[0][0] != pattern[0][1]:
222
243
  data_lis = pattern[0][0] + '_' + pattern[0][1]
223
244
  df.insert(loc=1, column='数据周期', value=data_lis)
245
+ check_remove_file = True
224
246
 
225
247
  elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
226
248
  # 店铺商品排行
227
249
  df = pd.read_excel(os.path.join(root, name), header=4)
228
250
  if len(df) == 0:
229
251
  print(f'{name} 报表数据为空')
252
+ check_remove_file = True
230
253
  continue
231
254
  # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
232
255
  # df.replace(to_replace=[','], value='', regex=True, inplace=True)
@@ -234,18 +257,22 @@ class DatabaseUpdate:
234
257
  if date01[0] != date02[0]:
235
258
  data_lis = date01[0] + '_' + date02[0]
236
259
  df.insert(loc=1, column='数据周期', value=data_lis)
260
+ check_remove_file = True
237
261
  elif name.endswith('.xls') and '参谋店铺整体日报' in name:
238
262
  # 自助取数,店铺日报
239
263
  df = pd.read_excel(os.path.join(root, name), header=7)
240
264
  if len(df) == 0:
241
265
  print(f'{name} 报表数据为空')
266
+ check_remove_file = True
242
267
  continue
243
268
  df.rename(columns={'统计日期': '日期'}, inplace=True)
269
+ check_remove_file = True
244
270
  elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
245
271
  # 自助取数,每日流量
246
272
  df = pd.read_excel(os.path.join(root, name), header=7)
247
273
  if len(df) == 0:
248
274
  print(f'{name} 报表数据为空')
275
+ check_remove_file = True
249
276
  continue
250
277
  df.rename(columns={'统计日期': '日期'}, inplace=True)
251
278
  # 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
@@ -258,11 +285,13 @@ class DatabaseUpdate:
258
285
  else '智能场景' if x == '智能场景(原万相台)'
259
286
  else x
260
287
  )
288
+ check_remove_file = True
261
289
  elif name.endswith('.xls') and '商品sku' in name:
262
290
  # 自助取数,商品sku
263
291
  df = pd.read_excel(os.path.join(root, name), header=7)
264
292
  if len(df) == 0:
265
293
  print(f'{name} 报表数据为空')
294
+ check_remove_file = True
266
295
  continue
267
296
  df.rename(columns={
268
297
  '统计日期': '日期',
@@ -270,11 +299,13 @@ class DatabaseUpdate:
270
299
  'SKU ID': 'sku id',
271
300
  '商品SKU': '商品sku',
272
301
  }, inplace=True)
302
+ check_remove_file = True
273
303
  elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
274
304
  # 自助取数,月店铺流量来源
275
305
  df = pd.read_excel(os.path.join(root, name), header=7)
276
306
  if len(df) == 0:
277
307
  print(f'{name} 报表数据为空')
308
+ check_remove_file = True
278
309
  continue
279
310
  df.rename(columns={'统计日期': '数据周期'}, inplace=True)
280
311
  # 2024-2-19 官方更新了推广渠道来源名称,自助取数没有更新,这里强制更改
@@ -288,47 +319,56 @@ class DatabaseUpdate:
288
319
  else x
289
320
  )
290
321
  df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
322
+ check_remove_file = True
291
323
  elif name.endswith('.csv') and 'baobei' in name:
292
324
  # 生意经宝贝指标日数据
293
325
  date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
294
326
  if not date: # 阻止月数据及已转换的表格
295
327
  print(f'{name} 不支持或是已转换的表格')
296
328
  # os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
329
+ check_remove_file = True
297
330
  continue
298
331
  df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
299
332
  if len(df) == 0:
300
333
  print(f'{name} 报表数据为空')
301
334
  os.remove(os.path.join(root, name))
335
+ check_remove_file = True
302
336
  continue
303
337
  if '日期' in df.columns.tolist():
304
338
  df.pop('日期')
305
339
  new_date = '-'.join(date[0])
306
340
  df.insert(loc=0, column='日期', value=new_date)
307
341
  df.replace(to_replace=['--'], value='', regex=False, inplace=True)
342
+ check_remove_file = True
308
343
  elif name.endswith('.csv') and '店铺销售指标' in name:
309
344
  # 生意经, 店铺指标,仅限月数据,实际日指标也可以
310
345
  name_st = re.findall(r'(.*)\(分日', name)
311
346
  if not name_st:
312
347
  print(f'{name} 已转换的表格')
348
+ check_remove_file = True
313
349
  continue
314
350
  df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
315
351
  if len(df) == 0:
316
352
  print(f'{name} 报表数据为空')
353
+ check_remove_file = True
317
354
  continue
318
355
  df['日期'] = df['日期'].astype(str).apply(
319
356
  lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
320
357
  df.replace(to_replace=['--'], value='', regex=False, inplace=True)
358
+ check_remove_file = True
321
359
  elif name.endswith('csv') and '省份城市分析' in name:
322
360
  # 生意经,地域分布, 仅限日数据
323
361
  pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
324
362
  if not pattern or '省份城市分析2' not in name:
325
363
  print(f'{name} 不支持或已转换的表格')
326
364
  # os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
365
+ check_remove_file = True
327
366
  continue
328
367
  date = '-'.join(pattern[0][1:])
329
368
  df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
330
369
  if len(df) == 0:
331
370
  print(f'{name} 报表数据为空')
371
+ check_remove_file = True
332
372
  continue
333
373
  df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
334
374
  df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
@@ -342,12 +382,14 @@ class DatabaseUpdate:
342
382
  df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
343
383
  df.replace('NAN', 0, inplace=True)
344
384
  df['笔单价'] = df.apply(lambda x: 0 if x['销售量'] == 0 else 0 if x['销售量'] == '0' else x['笔单价'], axis=1)
385
+ check_remove_file = True
345
386
  elif name.endswith('csv') and 'order' in name:
346
387
  # 生意经,订单数据,仅限月数据
347
388
  pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
348
389
  if not pattern:
349
390
  print(f'{name} 不支持或已转换的表格')
350
391
  # os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
392
+ check_remove_file = True
351
393
  continue
352
394
  date1 = pattern[0][1:4]
353
395
  date1 = '-'.join(date1)
@@ -357,6 +399,7 @@ class DatabaseUpdate:
357
399
  df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
358
400
  if len(df) == 0:
359
401
  print(f'{name} 报表数据为空')
402
+ check_remove_file = True
360
403
  continue
361
404
  df.insert(loc=0, column='日期', value=date1)
362
405
  df.insert(loc=1, column='数据周期', value=date)
@@ -365,30 +408,38 @@ class DatabaseUpdate:
365
408
  df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
366
409
  df['颜色编码'] = df['商家编码'].apply(
367
410
  lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
411
+ check_remove_file = True
368
412
  elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
369
413
  # 直播间成交订单明细
370
414
  df = pd.read_excel(os.path.join(root, name), header=0)
371
415
  if len(df) == 0:
372
416
  print(f'{name} 报表数据为空')
417
+ check_remove_file = True
373
418
  continue
374
419
  df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
375
420
  df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
421
+ check_remove_file = True
376
422
  elif name.endswith('.xlsx') and '直播间大盘数据' in name:
377
423
  # 直播间大盘数据
378
424
  df = pd.read_excel(os.path.join(root, name), header=0)
379
425
  if len(df) == 0:
380
426
  print(f'{name} 报表数据为空')
427
+ check_remove_file = True
381
428
  continue
382
429
  df.rename(columns={'统计日期': '日期'}, inplace=True)
430
+ check_remove_file = True
383
431
  elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
384
432
  # 直播业绩-成交拆解
385
433
  df = pd.read_excel(os.path.join(root, name), header=5)
386
434
  if len(df) == 0:
387
435
  print(f'{name} 报表数据为空')
436
+ check_remove_file = True
388
437
  continue
389
438
  df.rename(columns={'统计日期': '日期'}, inplace=True)
439
+ check_remove_file = True
390
440
  elif name.endswith('.csv') and '淘宝店铺数据' in name:
391
441
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
442
+ check_remove_file = True
392
443
  elif name.endswith('.csv') and '人群洞察' in name:
393
444
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
394
445
  df.replace(to_replace=['--'], value='', regex=False, inplace=True)
@@ -397,37 +448,50 @@ class DatabaseUpdate:
397
448
  if is_move:
398
449
  try:
399
450
  os.remove(os.path.join(root, name)) # 是否移除原文件
451
+ check_remove_file = True
400
452
  except Exception as e:
401
453
  print(f'{name}, {e}')
402
454
  continue
403
455
  elif name.endswith('.csv') and '客户_客户概况_画像' in name:
404
456
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
457
+ check_remove_file = True
405
458
  elif name.endswith('.csv') and '市场排行_店铺' in name:
406
459
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
460
+ check_remove_file = True
407
461
  elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_商品发现' in name:
408
462
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
463
+ check_remove_file = True
409
464
  elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_汇总' in name:
410
465
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
466
+ check_remove_file = True
411
467
  elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_商品发现' in name:
412
468
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
469
+ check_remove_file = True
413
470
  elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_汇总' in name:
414
471
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
472
+ check_remove_file = True
415
473
  elif name.endswith('.csv') and '搜索排行_搜索' in name:
416
474
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
475
+ check_remove_file = True
417
476
  elif name.endswith('.csv') and '竞店分析-销售分析-关键指标对比' in name:
418
477
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
478
+ check_remove_file = True
419
479
  elif name.endswith('.csv') and '竞店分析-销售分析-top商品榜' in name:
420
480
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
481
+ check_remove_file = True
421
482
  elif name.endswith('.csv') and '竞店分析-来源分析-入店来源' in name:
422
483
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
484
+ check_remove_file = True
423
485
  elif name.endswith('.csv') and '竞店分析-来源分析-入店搜索词' in name:
424
486
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
487
+ check_remove_file = True
425
488
  # ----------------------- 京东数据处理分界线 -----------------------
426
489
  # ----------------------- 京东数据处理分界线 -----------------------
427
490
  elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
428
491
  # 京东店铺来源
429
492
  if '按天' not in name:
430
493
  print(f'{name} 京东流量请按天下载')
494
+ check_remove_file = True
431
495
  continue
432
496
  date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
433
497
  new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
@@ -436,6 +500,7 @@ class DatabaseUpdate:
436
500
  df = pd.read_excel(os.path.join(root, name), header=0)
437
501
  if len(df) == 0:
438
502
  print(f'{name} 报表数据为空')
503
+ check_remove_file = True
439
504
  continue
440
505
  df.insert(loc=0, column='日期', value=new_date01)
441
506
  if new_date01 != new_date02:
@@ -444,17 +509,20 @@ class DatabaseUpdate:
444
509
  for col_2024 in cols: # 京东这个表有字段加了去年日期,删除这些同比数据字段,不然列数量爆炸
445
510
  if '20' in col_2024 and '流量来源' in name:
446
511
  df.drop(col_2024, axis=1, inplace=True)
512
+ check_remove_file = True
447
513
  elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
448
514
  # 京东商品明细 文件转换
449
515
  date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
450
516
  if not date1[0]:
451
517
  print(f'{name}: 仅支持日数据')
518
+ check_remove_file = True
452
519
  continue
453
520
  if date1:
454
521
  date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
455
522
  df = pd.read_excel(os.path.join(root, name), header=0)
456
523
  if len(df) == 0:
457
524
  print(f'{name} 报表数据为空')
525
+ check_remove_file = True
458
526
  continue
459
527
  if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
460
528
  new_name = f'sku_{date1}_全部渠道_商品明细.csv'
@@ -471,30 +539,37 @@ class DatabaseUpdate:
471
539
  elif 'spu' in new_name:
472
540
  db_name = '京东数据2'
473
541
  collection_name = 'spu_商品明细'
542
+ check_remove_file = True
474
543
  elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
475
544
  # 京东商品词下排名
476
545
  try:
477
546
  pattern = re.findall(r'(\d{4}-\d{2}-\d{2})-(\d{4}-\d{2}-\d{2})', name)
478
547
  if not pattern:
548
+ check_remove_file = True
479
549
  continue
480
550
  if pattern[0][0] == pattern[0][1]:
481
551
  print(f'{name}: 检测到数据周期异常,仅支持7天数据')
552
+ check_remove_file = True
482
553
  continue
483
554
  df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
484
555
  if len(df) == 0:
485
556
  print(f'{name} 报表数据为空')
557
+ check_remove_file = True
486
558
  continue
487
559
  if len(df.columns.tolist()) < 20:
488
560
  print(f'{name}: 报表可能缺失诊断数据')
489
561
  os.remove(os.path.join(root, name))
562
+ check_remove_file = True
490
563
  continue
491
564
  df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
492
565
  for col in ['词人气', '搜索点击率']:
493
566
  if col in df.columns.tolist():
494
567
  df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
568
+ check_remove_file = True
495
569
  except Exception as e:
496
570
  print(e)
497
571
  print(name, '报错')
572
+ check_remove_file = True
498
573
  continue
499
574
  elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
500
575
  # 京东商品排名
@@ -502,11 +577,13 @@ class DatabaseUpdate:
502
577
  df = pd.read_excel(os.path.join(root, name), header=0)
503
578
  if len(df) == 0:
504
579
  print(f'{name} 报表数据为空')
580
+ check_remove_file = True
505
581
  continue
506
582
  df.insert(0, '日期', date_in) # 插入新列
507
583
  df.rename(columns={'SKU': 'skuid'}, inplace=True)
508
584
  if '点击率' in df.columns.tolist():
509
585
  df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
586
+ check_remove_file = True
510
587
  elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
511
588
  # 京东,竞争-竞店概况-竞店详情-全部渠道
512
589
  date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
@@ -515,68 +592,87 @@ class DatabaseUpdate:
515
592
  df = pd.read_excel(os.path.join(root, name), header=0)
516
593
  if len(df) == 0:
517
594
  print(f'{name} 报表数据为空')
595
+ check_remove_file = True
518
596
  continue
519
597
  df.insert(loc=0, column='日期', value=start_date)
520
- elif name.endswith('.xls') and 'JD店铺日报_店铺' in name:
598
+ check_remove_file = True
599
+ elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
521
600
  # 京东 自助报表 店铺日报
522
601
  df = pd.read_excel(os.path.join(root, name), header=0)
523
602
  if len(df) == 0:
524
603
  print(f'{name} 报表数据为空')
604
+ check_remove_file = True
605
+ continue
606
+ if '访客数-全部渠道' not in df.columns.tolist(): # 识别是否真的京东日报
525
607
  continue
526
608
  df['日期'] = df['日期'].apply(
527
609
  lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
528
610
  )
611
+ check_remove_file = True
529
612
  elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
530
613
  # 京东 行业 商家榜单
531
614
  date2 = re.findall(r'_\d{8}-\d+', name)
532
615
  if date2:
533
616
  print(f'{name}: 请下载日数据,不支持其他周期')
534
617
  # os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
618
+ check_remove_file = True
535
619
  continue
536
620
  df = pd.read_excel(os.path.join(root, name), header=0)
537
621
  if len(df) == 0:
538
622
  print(f'{name} 报表数据为空')
623
+ check_remove_file = True
539
624
  continue
540
625
  df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
541
626
  df.insert(loc=0, column='类型', value='商家榜单')
627
+ check_remove_file = True
542
628
  elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
543
629
  # 京东 sku 导出
544
630
  df = pd.read_excel(os.path.join(root, name), header=0)
545
631
  if len(df) == 0:
546
632
  print(f'{name} 报表数据为空')
633
+ check_remove_file = True
547
634
  continue
548
635
  d_time = datetime.datetime.today().strftime('%Y-%m-%d')
549
636
  df.insert(loc=0, column='日期', value=d_time)
550
637
  df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
638
+ check_remove_file = True
551
639
  elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
552
640
  # 京东 spu 导出
553
641
  df = pd.read_excel(os.path.join(root, name), header=0)
554
642
  if len(df) == 0:
555
643
  print(f'{name} 报表数据为空')
644
+ check_remove_file = True
556
645
  continue
557
646
  d_time = datetime.datetime.today().strftime('%Y-%m-%d')
558
647
  df.insert(loc=0, column='日期', value=d_time)
648
+ check_remove_file = True
559
649
  elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
560
650
  # 京东推广数据
561
651
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
562
652
  if len(df) == 0:
563
653
  print(f'{name} 报表数据为空')
654
+ check_remove_file = True
564
655
  continue
565
656
  df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
657
+ check_remove_file = True
566
658
  elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
567
659
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
568
660
  if len(df) == 0:
569
661
  print(f'{name} 报表数据为空')
662
+ check_remove_file = True
570
663
  continue
571
664
  df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
572
665
  df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
573
666
  df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
667
+ check_remove_file = True
574
668
  elif name.endswith('.xlsx') and '零售明细统计' in name:
575
669
  df = pd.read_excel(os.path.join(root, name), header=0)
576
670
  if len(df) == 0:
577
671
  print(f'{name} 报表数据为空')
672
+ check_remove_file = True
578
673
  continue
579
674
  df = df[df['缩略图'] != '合计']
675
+ check_remove_file = True
580
676
  elif name.endswith('.csv') and '营销概况_全站营销' in name:
581
677
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
582
678
  df = df[(df['日期'] != '日期') & (df['日期'] != '汇总') & (df['日期'] != '0') & (df['花费'] != '0') & (df['花费'] != '0.00')]
@@ -584,6 +680,7 @@ class DatabaseUpdate:
584
680
  df.drop("'当前时间'", axis=1, inplace=True)
585
681
  df.rename(columns={'全站ROI': '全站roi'}, inplace=True)
586
682
  df.insert(loc=1, column='产品线', value='全站营销')
683
+ check_remove_file = True
587
684
  elif name.endswith('.csv') and '关键词点击成交报表_pbix同步_勿删改' in name:
588
685
  df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
589
686
  for col in df.columns.tolist():
@@ -595,6 +692,7 @@ class DatabaseUpdate:
595
692
  df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
596
693
  # min_clm = str(df['日期'].min()).split(' ')[0]
597
694
  # max_clm = str(df['日期'].max()).split(' ')[0]
695
+ check_remove_file = True
598
696
 
599
697
  # 商品素材,必须保持放在最后处理
600
698
  elif name.endswith('xlsx'):
@@ -619,8 +717,9 @@ class DatabaseUpdate:
619
717
  collection_name = '商品素材导出'
620
718
  else:
621
719
  df = pd.DataFrame()
720
+ check_remove_file = True
622
721
 
623
- if is_move:
722
+ if is_move and check_remove_file:
624
723
  try:
625
724
  os.remove(os.path.join(root, name)) # 是否移除原文件
626
725
  except Exception as e:
mdbq/clean/data_clean.py CHANGED
@@ -895,13 +895,15 @@ class DataClean:
895
895
  m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_竞店监控_日数据')
896
896
  os.remove(os.path.join(root, name))
897
897
 
898
- elif name.endswith('.xls') and '店铺' in name:
898
+ elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
899
899
  # 京东 自助报表 店铺日报
900
900
  df = pd.read_excel(os.path.join(root, name), header=0)
901
901
  if len(df) == 0:
902
902
  print(f'{name} 报表数据为空')
903
903
  os.remove(os.path.join(root, name))
904
904
  continue
905
+ if '访客数-全部渠道' not in df.columns.tolist(): # 识别是否真的京东日报
906
+ continue
905
907
  df['日期'] = df['日期'].apply(
906
908
  lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
907
909
  )
mdbq/company/copysh.py CHANGED
@@ -377,3 +377,4 @@ if __name__ == '__main__':
377
377
  main()
378
378
  # # 聚合数据,并清理聚合数据
379
379
  # query_data.data_aggregation(service_databases=[{'company': 'mysql'}], months=1)
380
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mdbq
3
- Version: 1.9.5
3
+ Version: 1.9.7
4
4
  Home-page: https://pypi.org/project/mdbsql
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,7 +1,7 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
2
  mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
- mdbq/aggregation/aggregation.py,sha256=dQdaZZ8PD8uHY5opW9M6EIEONv-q_V-e_XtvITtJNrc,67166
4
+ mdbq/aggregation/aggregation.py,sha256=QAN378cXlkwonHUDSBYfdZRfHBuqft_HR7Vfr8l87-k,72085
5
5
  mdbq/aggregation/df_types.py,sha256=oQJS2IBU3_IO6GMgbssHuC2yCjNnbta0QPGrFOwNLnU,7591
6
6
  mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
7
7
  mdbq/aggregation/optimize_data.py,sha256=u2Kl_MFtZueXJ57ycy4H2OhXD431RctUYJYCl637uT0,4176
@@ -9,9 +9,9 @@ mdbq/aggregation/query_data.py,sha256=32NjVVYLnfFkzD8TflmNVhpdQTLRRUrb9toMGApSOC
9
9
  mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
10
10
  mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
11
11
  mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
12
- mdbq/clean/data_clean.py,sha256=VI_f9mQ2tHExBytQCCkajbPVpC0yNEOaVCtySe_OW40,100789
12
+ mdbq/clean/data_clean.py,sha256=y83uqOyM6nL0d3ClUqYMjE23ghBEkhz9uv19qrxA8NA,100980
13
13
  mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
14
- mdbq/company/copysh.py,sha256=4PGjvmPzvrmstOaAwHQGFXIGCWqqNXZEOYf1QdUvMlI,17762
14
+ mdbq/company/copysh.py,sha256=VUaaJPXPYPHWwnkdK77PWz_dAXZyEmYBA9Df1yROHAc,17764
15
15
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
16
16
  mdbq/config/get_myconf.py,sha256=-CFEW0dQh4OIwVgwK-cL0eVp1LN3PjJgN89d4P5TB9I,6011
17
17
  mdbq/config/products.py,sha256=vIK8DJ-F3XXwvNPK-4OJq2tZITNlL6Sub8QBdoOng8U,5676
@@ -36,7 +36,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
36
36
  mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
37
37
  mdbq/pbix/refresh_all.py,sha256=0uAnBKCd5cx5FLTkawN1GV9yi87rfyMgYal5LABtumQ,7186
38
38
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
39
- mdbq-1.9.5.dist-info/METADATA,sha256=qbeZPyNml9_seMx78A_nUdztJUVCi1xK8_E2MpdEu_4,245
40
- mdbq-1.9.5.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
41
- mdbq-1.9.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
42
- mdbq-1.9.5.dist-info/RECORD,,
39
+ mdbq-1.9.7.dist-info/METADATA,sha256=rqBOduo-xKxLyXbxt83RXob4dVYqlhdql_WL06TysmY,245
40
+ mdbq-1.9.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
41
+ mdbq-1.9.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
42
+ mdbq-1.9.7.dist-info/RECORD,,
File without changes