mdbq 1.2.4__py3-none-any.whl → 1.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +57 -79
- mdbq/aggregation/query_data.py +26 -19
- mdbq/mysql/mysql.py +4 -3
- mdbq/mysql/s_query.py +3 -0
- {mdbq-1.2.4.dist-info → mdbq-1.2.6.dist-info}/METADATA +1 -1
- {mdbq-1.2.4.dist-info → mdbq-1.2.6.dist-info}/RECORD +8 -9
- {mdbq-1.2.4.dist-info → mdbq-1.2.6.dist-info}/WHEEL +1 -1
- mdbq/mysql/data_types_即将删除.py +0 -243
- {mdbq-1.2.4.dist-info → mdbq-1.2.6.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -47,10 +47,29 @@ class DatabaseUpdate:
print(f'1.1.0 初始化时传入了不存在的目录: {self.path}')
return

+filename = '标题对照表.csv'
+support_file = set_support.SetSupport(dirname='support').dirname
+if not os.path.isfile(os.path.join(support_file, filename)):
+print(f'缺少关键文件支持: {os.path.join(support_file, filename)}')
+return
+df = pd.read_csv(os.path.join(support_file, filename), encoding='utf-8_sig', header=0, na_filter=False)
+datas = df.to_dict('records') # 转字典
+# print(datas)
+
for root, dirs, files in os.walk(self.path, topdown=False):
for name in files:
if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
continue
+
+db_name = None # 初始化/重置变量,避免进入下一个循环
+collection_name = None
+for data in datas: # 根据标题对照表适配 db_name 和 collection_name
+if data['关键词1'] in name and data['关键词2'] in name:
+db_name = data['数据库名']
+collection_name = data['数据表']
+# print(name, db_name, collection_name)
+# return
+
# 只针对 csv, xlsx 文件进行处理
if not name.endswith('.csv') and not name.endswith('.xls') and not name.endswith('.xlsx'):
continue
@@ -107,8 +126,6 @@ class DatabaseUpdate:
shop_name = ''
# df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True) # 替换掉特殊字符
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
-db_name = '推广数据2'
-collection_name = '超级直播'
elif name.endswith('.xls') and '短直联投' in name:
# 短直联投
df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
@@ -117,8 +134,6 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
-db_name = '推广数据2'
-collection_name = '短直联投'
elif name.endswith('.xls') and '视频加速推广' in name:
# 超级短视频
df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
@@ -127,15 +142,11 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
# df.replace(to_replace=[''], value=0, regex=False, inplace=True)
-db_name = '推广数据2'
-collection_name = '超级短视频'
if '人群报表汇总' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
if len(df) == 0:
print(f'{name} 报表数据为空')
continue
-db_name = '推广数据2'
-collection_name = '达摩盘_dmp报表'
# ----------------- 推广报表 分割线 -----------------
# ----------------- 推广报表 分割线 -----------------
date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
@@ -183,8 +194,6 @@ class DatabaseUpdate:
if date01[0] != date02[0]:
data_lis = date01[0] + '_' + date02[0]
df.insert(loc=1, column='数据周期', value=data_lis)
-db_name = '生意参谋2'
-collection_name = '商品排行'
elif name.endswith('.xls') and '参谋店铺整体日报' in name:
# 自助取数,店铺日报
df = pd.read_excel(os.path.join(root, name), header=7)
@@ -192,8 +201,6 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
df.rename(columns={'统计日期': '日期'}, inplace=True)
-db_name = '生意参谋2'
-collection_name = '自助取数_整体日报'
elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
# 自助取数,每日流量
df = pd.read_excel(os.path.join(root, name), header=7)
@@ -211,8 +218,6 @@ class DatabaseUpdate:
else '智能场景' if x == '智能场景(原万相台)'
else x
)
-db_name = '生意参谋2'
-collection_name = '自助取数_每日流量'
elif name.endswith('.xls') and '商品sku' in name:
# 自助取数,商品sku
df = pd.read_excel(os.path.join(root, name), header=7)
@@ -225,8 +230,6 @@ class DatabaseUpdate:
'SKU ID': 'sku id',
'商品SKU': '商品sku',
}, inplace=True)
-db_name = '生意参谋2'
-collection_name = '自助取数_商品sku'
elif name.endswith('.xls') and '参谋店铺流量来源(月)' in name:
# 自助取数,月店铺流量来源
df = pd.read_excel(os.path.join(root, name), header=7)
@@ -245,8 +248,6 @@ class DatabaseUpdate:
else x
)
df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
-db_name = '生意参谋2'
-collection_name = '自助取数_店铺流量_月数据'
elif name.endswith('.csv') and 'baobei' in name:
# 生意经宝贝指标日数据
date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
@@ -264,8 +265,6 @@ class DatabaseUpdate:
new_date = '-'.join(date[0])
df.insert(loc=0, column='日期', value=new_date)
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
-db_name = '生意经2'
-collection_name = '宝贝指标'
elif name.endswith('.csv') and '店铺销售指标' in name:
# 生意经, 店铺指标,仅限月数据,实际日指标也可以
name_st = re.findall(r'(.*)\(分日', name)
@@ -279,9 +278,7 @@ class DatabaseUpdate:
df['日期'] = df['日期'].astype(str).apply(
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
-
-collection_name = '店铺指标'
-elif name.endswith('csv') and '省份' in name:
+elif name.endswith('csv') and '省份城市分析' in name:
# 生意经,地域分布, 仅限日数据
pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
if not pattern or '省份城市分析2' not in name:
@@ -303,8 +300,6 @@ class DatabaseUpdate:
df.insert(loc=0, column='日期', value=date)
df['省份'] = pov
df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
-db_name = '生意经2'
-collection_name = '地域分布_省份城市分析'
elif name.endswith('csv') and 'order' in name:
# 生意经,订单数据,仅限月数据
pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
@@ -328,8 +323,6 @@ class DatabaseUpdate:
df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
df['颜色编码'] = df['商家编码'].apply(
lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
-db_name = '生意经2'
-collection_name = '订单数据'
elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
# 直播间成交订单明细
df = pd.read_excel(os.path.join(root, name), header=0)
@@ -338,8 +331,6 @@ class DatabaseUpdate:
continue
df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
-db_name = '生意参谋2'
-collection_name = '直播间成交订单明细'
elif name.endswith('.xlsx') and '直播间大盘数据' in name:
# 直播间大盘数据
df = pd.read_excel(os.path.join(root, name), header=0)
@@ -347,8 +338,6 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
df.rename(columns={'统计日期': '日期'}, inplace=True)
-db_name = '生意参谋2'
-collection_name = '直播间大盘数据'
elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
# 直播业绩-成交拆解
df = pd.read_excel(os.path.join(root, name), header=5)
@@ -356,44 +345,26 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
df.rename(columns={'统计日期': '日期'}, inplace=True)
-db_name = '生意参谋2'
-collection_name = '直播业绩'
elif name.endswith('.csv') and '淘宝店铺数据' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '市场数据2'
-collection_name = '淘宝店铺数据'
elif name.endswith('.csv') and '人群洞察' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
df.replace(to_replace=['--'], value='', regex=False, inplace=True)
df = df[df['人群规模'] != '']
if len(df) == 0:
continue
-db_name = '推广数据2'
-collection_name = '万相台_人群洞察'
elif name.endswith('.csv') and '客户_客户概况_画像' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '生意参谋2'
-collection_name = '客户_客户概况_画像'
elif name.endswith('.csv') and '市场排行_店铺' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '市场数据2'
-collection_name = '市场排行_店铺'
elif name.endswith('.csv') and '类目洞察_属性分析' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '市场数据2'
-collection_name = '类目洞察_属性分析'
elif name.endswith('.csv') and '类目洞察_价格分析' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '市场数据2'
-collection_name = '类目洞察_价格分析'
elif name.endswith('.csv') and '竞店分析-销售分析' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '市场数据2'
-collection_name = '竞店分析_销售分析'
elif name.endswith('.csv') and '竞店分析-来源分析' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-db_name = '市场数据2'
-collection_name = '竞店分析_来源分析'
# ----------------------- 京东数据处理分界线 -----------------------
# ----------------------- 京东数据处理分界线 -----------------------
elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
@@ -416,8 +387,6 @@ class DatabaseUpdate:
for col_2024 in cols: # 京东这个表有字段加了去年日期,删除这些同比数据字段,不然列数量爆炸
if '20' in col_2024 and '流量来源' in name:
df.drop(col_2024, axis=1, inplace=True)
-db_name = '京东数据2'
-collection_name = '流量来源_日数据'
elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
# 京东商品明细 文件转换
date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
@@ -439,7 +408,7 @@ class DatabaseUpdate:
df.rename(columns={'商品ID': '商品id'}, inplace=True)
df.insert(loc=0, column='日期', value=date1)
df['最近上架时间'].loc[0] = df['最近上架时间'].loc[1] # 填充这一列, 避免上传 mysql 日期类型报错
-if 'sku' in new_name:
+if 'sku' in new_name: # 即使有文件对照表,也不能删除这个条件,spu ,sku 是后来加的
db_name = '京东数据2'
collection_name = 'sku_商品明细'
elif 'spu' in new_name:
@@ -455,8 +424,6 @@ class DatabaseUpdate:
for col in ['词人气', '搜索点击率']:
if col in df.columns.tolist():
df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
-db_name = '京东数据2'
-collection_name = '商品词下排名'
elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
# 京东商品排名
date_in = re.findall(r'(\d{4}-\d{2}-\d{2})-搜索', str(name))[0]
@@ -468,8 +435,6 @@ class DatabaseUpdate:
df.rename(columns={'SKU': 'skuid'}, inplace=True)
if '点击率' in df.columns.tolist():
df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
-db_name = '京东数据2'
-collection_name = '商品排名'
elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
# 京东,竞争-竞店概况-竞店详情-全部渠道
date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
@@ -480,9 +445,7 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
df.insert(loc=0, column='日期', value=start_date)
-
-collection_name = '竞店监控_日数据'
-elif name.endswith('.xls') and '店铺' in name:
+elif name.endswith('.xls') and 'JD店铺日报_店铺' in name:
# 京东 自助报表 店铺日报
df = pd.read_excel(os.path.join(root, name), header=0)
if len(df) == 0:
@@ -491,8 +454,6 @@ class DatabaseUpdate:
df['日期'] = df['日期'].apply(
lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
)
-db_name = '京东数据2'
-collection_name = '京东_自助取数_店铺日报'
elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
# 京东 行业 商家榜单
date2 = re.findall(r'_\d{8}-\d+', name)
@@ -506,8 +467,6 @@ class DatabaseUpdate:
continue
df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
df.insert(loc=0, column='类型', value='商家榜单')
-db_name = '京东数据2'
-collection_name = '商家榜单'
elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
# 京东 sku 导出
df = pd.read_excel(os.path.join(root, name), header=0)
@@ -517,8 +476,6 @@ class DatabaseUpdate:
d_time = datetime.datetime.today().strftime('%Y-%m-%d')
df.insert(loc=0, column='日期', value=d_time)
df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
-db_name = '属性设置2'
-collection_name = '京东sku商品信息'
elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
# 京东 spu 导出
df = pd.read_excel(os.path.join(root, name), header=0)
@@ -527,8 +484,6 @@ class DatabaseUpdate:
continue
d_time = datetime.datetime.today().strftime('%Y-%m-%d')
df.insert(loc=0, column='日期', value=d_time)
-db_name = '属性设置2'
-collection_name = '京东spu商品信息'
elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
# 京东推广数据
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
@@ -536,8 +491,6 @@ class DatabaseUpdate:
print(f'{name} 报表数据为空')
continue
df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
-db_name = '京东数据2'
-collection_name = '推广数据_京准通'
elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
if len(df) == 0:
@@ -546,16 +499,12 @@ class DatabaseUpdate:
df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
-db_name = '京东数据2'
-collection_name = '推广数据_搜索词报表'
elif name.endswith('.xlsx') and '零售明细统计' in name:
df = pd.read_excel(os.path.join(root, name), header=0)
if len(df) == 0:
print(f'{name} 报表数据为空')
continue
df = df[df['缩略图'] != '合计']
-db_name = '生意经2'
-collection_name = 'e3_零售明细统计'

# 商品素材,必须保持放在最后处理
elif name.endswith('xlsx'):
@@ -599,6 +548,14 @@ class DatabaseUpdate:
for name in files:
if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
continue
+
+db_name = None # 初始化/重置变量,避免进入下一个循环
+collection_name = None
+for data in datas: # 根据标题对照表适配 db_name 和 collection_name
+if data['关键词1'] in name and data['关键词2'] in name:
+db_name = data['数据库名']
+collection_name = data['数据表']
+
# df = pd.DataFrame()
if name.endswith('.xlsx') and '明星店铺' in name:
# 品销宝
@@ -614,8 +571,6 @@ class DatabaseUpdate:
# print(f'{name}/{sheet4} 跳过')
continue
df.insert(loc=1, column='报表类型', value=sheet4)
-db_name = '推广数据2'
-collection_name = f'品销宝_{sheet4}'
self.datas.append(
{
'数据库名': db_name,
@@ -895,6 +850,8 @@ def upload_dir(path, db_name, collection_name, dbs={'mysql': True, 'mongodb': Tr
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
if len(df) == 0:
continue
+# if '新版' not in name:
+# continue
cv = converter.DataFrameConverter()
df = cv.convert_df_cols(df=df) # 清理列名和 df 中的非法字符
try:
@@ -996,14 +953,35 @@ def test():
# break


+def test2():
+dp = DatabaseUpdate(path='/Users/xigua/Downloads')
+dp.new_unzip(is_move=True)
+dp.cleaning(is_move=False, ) # 清洗数据, 存入 self.datas
+dp.upload_df(service_databases=[
+# {'home_lx': 'mongodb'},
+{'home_lx': 'mysql'},
+# {'nas': 'mysql'}
+])
+
+
if __name__ == '__main__':
# username, password, host, port = get_myconf.select_config_values(target_service='nas', database='mysql')
# print(username, password, host, port)
-file_dir(one_file=False)
+# file_dir(one_file=False)
# one_file_to_mysql(
-# file='/Users/xigua/数据中心/原始文件2
-# db_name='
-# table_name='
+# file='/Users/xigua/数据中心/原始文件2/推广报表/品销宝/账户/账户_明星店铺报表_2023-11-13_2023-12-12.csv',
+# db_name='推广数据2',
+# table_name='品销宝',
# target_service='home_lx',
# database='mysql'
# )
+# db_name = '生意参谋2'
+# table_name = '店铺来源_日数据_新版'
+# upload_dir(
+# path='/Users/xigua/数据中心/原始文件2/生意参谋/流量来源',
+# db_name=db_name,
+# collection_name=table_name,
+# dbs={'mysql': True, 'mongodb': False},
+# )
+
+test2()
mdbq/aggregation/query_data.py
CHANGED
@@ -166,7 +166,7 @@ class MysqlDatasQuery:
'加购人数': 1,
}
df = self.download.data_to_df(
-db_name='
+db_name='生意参谋2',
table_name='店铺来源_日数据',
start_date=start_date,
end_date=end_date,
@@ -240,6 +240,7 @@ class GroupBy:
'总成交金额': '成交金额'
}, inplace=True)
df = df.astype({
+'商品id': str,
'花费': float,
'展现量': int,
'点击量': int,
@@ -263,12 +264,13 @@ class GroupBy:
)
else:
df = df.groupby(['日期', '营销场景', '商品id', '花费', '展现量', '点击量'], as_index=False).agg(
-**{
-
-
-
-
-
+**{
+'加购量': ('加购量', np.min),
+'成交笔数': ('成交笔数', np.min),
+'成交金额': ('成交金额', np.min),
+'自然流量曝光量': ('自然流量曝光量', np.min),
+'直接成交笔数': ('直接成交笔数', np.max),
+'直接成交金额': ('直接成交金额', np.max)
}
)
df.insert(loc=1, column='推广渠道', value='万相台无界版') # df中插入新列
@@ -276,10 +278,10 @@ class GroupBy:
**{
'花费': ('花费', np.sum),
'成交笔数': ('成交笔数', np.max),
-
-
-
-
+'成交金额': ('成交金额', np.max),
+'自然流量曝光量': ('自然流量曝光量', np.max),
+'直接成交笔数': ('直接成交笔数', np.max),
+'直接成交金额': ('直接成交金额', np.max)
}
)
self.data_tgyj.update(
@@ -290,6 +292,7 @@ class GroupBy:
return df
elif '宝贝指标' in table_name:
""" 聚合时不可以加商家编码,编码有些是空白,有些是 0 """
+df['宝贝id'] = df['宝贝id'].astype(str)
df.fillna(0, inplace=True)
# df = df[(df['销售额'] != 0) | (df['退款额'] != 0)] # 注释掉, 因为后续使用生意经作为基准合并推广表,需确保所有商品id 齐全
df = df.groupby(['日期', '宝贝id', '行业类目'], as_index=False).agg(
@@ -320,6 +323,7 @@ class GroupBy:
elif '店铺来源_日数据' in table_name:
return df
elif '商品id编码表' in table_name:
+df['宝贝id'] = df['宝贝id'].astype(str)
df.drop_duplicates(subset='宝贝id', keep='last', inplace=True, ignore_index=True)
# df['行业类目'] = df['行业类目'].apply(lambda x: re.sub(' ', '', x))
try:
@@ -359,6 +363,7 @@ class GroupBy:
table_name: df[['商品id', '商品图片']],
}
)
+df['商品id'] = df['商品id'].astype(str)
return df
elif '商品成本' in table_name:
df.sort_values(by=['款号', '日期'], ascending=[False, True], ignore_index=True, inplace=True)
@@ -373,7 +378,7 @@ class GroupBy:
print(f'<{table_name}>: Groupby 类尚未配置,数据为空')
return pd.DataFrame({})

-@try_except
+# @try_except
def performance(self, bb_tg=True):
# print(self.data_tgyj)
tg, syj, idbm, pic, cost = (
@@ -390,13 +395,14 @@ class GroupBy:
df = pd.merge(tg, df, how='left', left_on='商品id', right_on='宝贝id')
df.drop(labels='宝贝id', axis=1, inplace=True)
if bb_tg is True:
-
+# 生意经合并推广表,完整的数据表,包含全店所有推广、销售数据
df = pd.merge(syj, df, how='left', left_on=['日期', '宝贝id'], right_on=['日期', '商品id'])
else:
# 推广表合并生意经 , 以推广数据为基准,销售数据不齐全
df = pd.merge(df, syj, how='left', left_on=['日期', '商品id'], right_on=['日期', '宝贝id'])
df.drop(labels='宝贝id', axis=1, inplace=True)
df.drop_duplicates(subset=['日期', '商品id', '花费', '销售额'], keep='last', inplace=True, ignore_index=True)
+df['成本价'] = df['成本价'].astype('float64')
df['商品成本'] = df.apply(lambda x: (x['成本价'] + x['销售额']/x['销售量'] * 0.11 + 6) * x['销售量'] if x['销售量'] > 0 else 0, axis=1)
df['商品毛利'] = df.apply(lambda x: x['销售额'] - x['商品成本'], axis=1)
df['毛利率'] = df.apply(lambda x: round((x['销售额'] - x['商品成本']) / x['销售额'], 4) if x['销售额'] > 0 else 0, axis=1)
@@ -493,12 +499,13 @@ def data_aggregation(service_databases=[{}]):
for service_database in service_databases:
for service_name, database in service_database.items():
sdq = MysqlDatasQuery(target_service=service_name) # 实例化数据处理类
-sdq.months =
+sdq.months = 0 # 设置数据周期, 1 表示近 2 个月
g = GroupBy() # 实例化数据聚合类
# 实例化数据库连接
username, password, host, port = get_myconf.select_config_values(target_service=service_name, database=database)
m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)

+# 从数据库中获取数据, 返回包含 df 数据的字典
data_dict = [
{
'数据库名': '聚合数据',
@@ -531,19 +538,19 @@ def data_aggregation(service_databases=[{}]):
'数据主体': sdq.sp_cost(),
},
]
-for items in data_dict:
+for items in data_dict: # 遍历返回结果
db_name, table_name, df = items['数据库名'], items['集合名'], items['数据主体']
df = g.groupby(df=df, table_name=table_name, is_maximize=True) # 2. 聚合数据
# g.as_csv(df=df, filename=table_name + '.csv')
-m.df_to_mysql(df=df, db_name=db_name, table_name=table_name) # 3. 回传数据库
+m.df_to_mysql(df=df, db_name=db_name, table_name=table_name, drop_dup=True) # 3. 回传数据库
res = g.performance(bb_tg=True) # 盈亏表,依赖其他表,单独做
-m.df_to_mysql(df=res, db_name='聚合数据', table_name='_全店商品销售')
+m.df_to_mysql(df=res, db_name='聚合数据', table_name='_全店商品销售', drop_dup=True)
res = g.performance(bb_tg=False) # 盈亏表,依赖其他表,单独做
-m.df_to_mysql(df=res, db_name='聚合数据', table_name='_推广商品销售')
+m.df_to_mysql(df=res, db_name='聚合数据', table_name='_推广商品销售', drop_dup=True)

# optimize_data.op_data(service_databases=service_databases, days=3650) # 立即启动对聚合数据的清理工作


if __name__ == '__main__':
-data_aggregation(service_databases=[{'
+data_aggregation(service_databases=[{'home_lx': 'mysql'}])
# optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650) # 立即启动对聚合数据的清理工作
mdbq/mysql/mysql.py
CHANGED
@@ -65,7 +65,8 @@ class MysqlUpload:
将 df 写入数据库
db_name: 数据库名称
table_name: 集合/表名称
-
+df_sql: 使用 df.to_sql 函数上传整个表, 不会排重
+drop_duplicates:仅限于聚合数据使用,其他情况不要设置此参数
drop_dup: 值为 True 时检查重复数据再插入,反之直接上传
filename: 传这个参数是方便定位产生错误的文件
"""
@@ -152,7 +153,7 @@ class MysqlUpload:

if df_sql:
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
-print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count},{self.filename}')
+print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')
engine = create_engine(
f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
df.to_sql(
@@ -185,7 +186,7 @@ class MysqlUpload:

# 5. 更新插入数据
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
-print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count},{self.filename}')
+print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count}, {self.filename}')

datas = df.to_dict(orient='records')
for data in datas:
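The docstring now distinguishes df_sql (bulk upload through df.to_sql, no duplicate check) from drop_dup (check for existing rows before inserting). A hedged usage sketch based only on the calls that appear in query_data.py above; the service name and credential lookup are placeholders for a local config, and the actual upload needs a reachable MySQL service:

    from mdbq.config import get_myconf
    from mdbq.mysql import mysql

    # Credentials come from the local config; 'home_lx' is simply the service name used in the diff.
    username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mysql')
    m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)

    # drop_dup=True asks the uploader to check for duplicate rows before inserting,
    # which is how the 聚合数据 tables are written back in query_data.py.
    # m.df_to_mysql(df=some_df, db_name='聚合数据', table_name='_全店商品销售', drop_dup=True)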
mdbq/mysql/s_query.py
CHANGED
@@ -12,6 +12,7 @@ from sqlalchemy import create_engine
import os
import calendar
from mdbq.config import get_myconf
+from mdbq.dataframe import converter

warnings.filterwarnings('ignore')
"""
@@ -84,6 +85,8 @@ class QueryDatas:

if len(df) == 0:
print(f'database: {db_name}, table: {table_name} 查询的数据为空')
+cv = converter.DataFrameConverter()
+df = cv.convert_df_cols(df)
return df

def columns_to_list(self, db_name, table_name, columns_name) -> list:
{mdbq-1.2.4.dist-info → mdbq-1.2.6.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/aggregation.py,sha256=
+mdbq/aggregation/aggregation.py,sha256=mBgIY7afloW8H5qoBy56vCabIQRxVvAhrRZgGbZUxFQ,55791
mdbq/aggregation/df_types.py,sha256=rHLIgv82PJSFmDvXkZyOJAffXkFyyMyFO23w9tUt8EQ,7525
mdbq/aggregation/mysql_types.py,sha256=umVixmbFZM63k-QhVWLvOuhcAde4P_oDKbdo8ry2O9w,10633
mdbq/aggregation/optimize_data.py,sha256=jLAWtxPUuhpo4XTVrhKtT4xK3grs7r73ePQfLhxlu1I,779
-mdbq/aggregation/query_data.py,sha256=
+mdbq/aggregation/query_data.py,sha256=fg_9OdNSwHbo9vhK1pAKOazHFHZfE9_rBxRyQIWJX9U,25694
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
@@ -24,9 +24,8 @@ mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/
-mdbq/mysql/
-mdbq/mysql/s_query.py,sha256=4c24SwbqtnO33o8CgWlTQ_j8sZYl5BRIQkaD9CI-vTY,7901
+mdbq/mysql/mysql.py,sha256=KvUQflP5sYOECTHOs2Fs9ABcQvgPCbBnAX2ZlE3JjgY,37544
+mdbq/mysql/s_query.py,sha256=a33aYhW6gAnspIZfQ7l23ePln9-MD1f_ukypr5M0jd8,8018
mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
mdbq/other/porxy.py,sha256=UHfgEyXugogvXgsG68a7QouUCKaohTKKkI4RN-kYSdQ,4961
@@ -36,7 +35,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
mdbq/pbix/refresh_all.py,sha256=tgy762608HMaXWynbOURIf2UVMuSPybzrDXQnOOcnZU,6102
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq-1.2.
-mdbq-1.2.
-mdbq-1.2.
-mdbq-1.2.
+mdbq-1.2.6.dist-info/METADATA,sha256=_s1z5j_Q_dSi4lrw46NcpwMlgz5TkZnndOmWp4290Mk,245
+mdbq-1.2.6.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
+mdbq-1.2.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-1.2.6.dist-info/RECORD,,
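For reference, each RECORD line is the file path, its sha256 digest in URL-safe base64 without padding, and the size in bytes; the entries above change because the rebuilt modules hash differently. A short sketch of how such an entry is computed for any file in the wheel:

    import base64
    import hashlib

    def record_entry(path):
        # RECORD format: "<path>,sha256=<urlsafe base64 digest, '=' padding stripped>,<size>"
        data = open(path, 'rb').read()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
        return f'{path},sha256={digest},{len(data)}'

    # Example: print(record_entry('mdbq/aggregation/aggregation.py'))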
mdbq/mysql/data_types_即将删除.py
DELETED
@@ -1,243 +0,0 @@
-# -*- coding:utf-8 -*-
-import warnings
-import pandas as pd
-import numpy as np
-import chardet
-import zipfile
-
-from numpy import dtype
-from pandas.tseries.holiday import next_monday
-from pyzipper import PyZipFile
-import os
-import platform
-import json
-import pymysql
-from mdbq.mongo import mongo
-from mdbq.mysql import mysql
-from mdbq.mysql import s_query
-from mdbq.config import get_myconf
-from mdbq.config import set_support
-from mdbq.dataframe import converter
-import datetime
-import time
-import re
-import shutil
-import getpass
-
-from sqlalchemy.dialects.postgresql.pg_catalog import pg_get_serial_sequence
-
-warnings.filterwarnings('ignore')
-"""
-1. 记录 dataframe 或者数据库的列信息(dtypes)
-2. 更新 mysql 中所有数据库的 dtypes 信息到本地 json
-"""
-
-
-class DataTypes:
-"""
-数据简介: 记录 dataframe 或者数据库的列信息(dtypes),可以记录其信息或者加载相关信息用于入库使用,
-第一字段为分类(如 dataframe/mysql),第二字段为数据库名,第三字段为集合名,第四段列名及其数据类型
-"""
-def __init__(self):
-self.datas = {
-'_json统计':
-{
-'分类': 0,
-'数据库量': 0,
-'集合数量': 0,
-'字段量': 0,
-'数据简介': '记录 dataframe 或者数据库的列信息(dtypes)',
-}
-}
-self.path = set_support.SetSupport(dirname='support').dirname
-self.json_file = os.path.join(self.path, 'mysql_types.json')
-if not os.path.isdir(self.path):
-os.makedirs(self.path)
-if not os.path.isfile(self.json_file):
-with open(self.json_file, 'w', encoding='utf-8_sig') as f:
-json.dump(self.datas, f, ensure_ascii=False, sort_keys=True, indent=4)
-
-def json_before(self):
-""" 本地 json 文件的 dtypes 信息, 初始化更新给 self.datas """
-with open(self.json_file, 'r', encoding='utf-8_sig') as f:
-json_ = json.load(f)
-self.datas.update(json_)
-
-def df_dtypes_to_json(self, db_name, collection_name, path, df=pd.DataFrame(), is_file_dtype=True):
-if len(df) == 0:
-return
-cv = converter.DataFrameConverter()
-df = cv.convert_df_cols(df=df) # 清理 dataframe 列名的不合规字符
-dtypes = df.dtypes.apply(str).to_dict()
-dtypes = {'dataframe': {db_name: {collection_name: dtypes}}}
-self.dtypes_to_json(dtypes=dtypes, cl='dataframe', db_name=db_name, collection_name=collection_name, path=path, is_file_dtype=is_file_dtype)
-
-def dtypes_to_json(self, cl, dtypes, db_name, collection_name, path, is_file_dtype, ):
-""" 更新 dataframe 的 dtypes 信息到 json 文件 """
-if not os.path.exists(path):
-os.makedirs(path)
-json_file = os.path.join(path, 'mysql_types.json')
-if os.path.isfile(json_file):
-self.json_before(json_file=json_file) # 更新本地json信息到 self.datas
-
-if not os.path.isfile(json_file): # 如果不存在本地 json 文件, 直接返回即可
-self.datas.update(dtypes)
-with open(json_file, 'w', encoding='utf-8_sig') as f:
-json.dump(self.datas, f, ensure_ascii=False, sort_keys=True, indent=4)
-else: # 存在则读取,并更新 df 的 dtypes
-if cl in self.datas.keys():
-if db_name in list(self.datas[cl].keys()): # ['京东数据2', '天猫数据2', '生意参谋数据2', '生意经2']
-if collection_name in list(self.datas[cl][db_name].keys()):
-if is_file_dtype: # 旧数据优先
-# # 用 dtypes 更新, 允许手动指定 json 文件里面的数据类型
-dtypes[cl][db_name][collection_name].update(self.datas[cl][db_name][collection_name])
-# 将 dtypes 更新进去,使 self.datas 包含新旧信息
-self.datas[cl][db_name][collection_name].update(dtypes[cl][db_name][collection_name])
-else: # 新数据优先
-self.datas[cl][db_name][collection_name].update(dtypes[cl][db_name][collection_name])
-else:
-if is_file_dtype: # 旧数据优先
-dtypes[cl][db_name].update(self.datas[cl][db_name])
-self.datas[cl][db_name].update(dtypes[cl][db_name])
-else:
-self.datas[cl][db_name].update(dtypes[cl][db_name])
-else:
-# dtypes.update(self.datas) # 可以注释掉, 因为旧数据 self.datas 是空的
-self.datas[cl].update(dtypes[cl])
-else:
-self.datas.update(dtypes)
-
-cif = 0 # 分类
-dbs = 0 # 数据库
-collections = 0 # 集合
-cols = 0 # 字段
-for k, v in self.datas.items():
-if k == '_json统计':
-continue # 不统计头信息
-cif += 1
-for t, g in v.items():
-dbs += 1
-for d, j in g.items():
-collections += 1
-for t, p in j.items():
-cols += 1
-tips = {'分类': cif, '数据库量': dbs, '集合数量': collections, '字段量': cols}
-self.datas['_json统计'].update(tips)
-with open(json_file, 'w', encoding='utf-8_sig') as f:
-json.dump(
-self.datas,
-f,
-ensure_ascii=False, # 默认True,非ASCII字符将被转义。如为False,则非ASCII字符会以\uXXXX输出
-sort_keys=True, # 默认为False。如果为True,则字典的输出将按键排序。
-indent=4,
-)
-
-def mysql_dtypes_to_json(self, db_name, tabel_name, path, is_file_dtype=True):
-username, password, host, port = get_myconf.select_config_values(
-target_service='home_lx',
-database='mysql',
-)
-sq = s_query.QueryDatas(username=username, password=password, host=host, port=port)
-name_type = sq.dtypes_to_list(db_name=db_name, tabel_name=tabel_name)
-if name_type:
-dtypes = {item['COLUMN_NAME']: item['COLUMN_TYPE'] for item in name_type}
-dtypes = {'mysql': {db_name: {tabel_name: dtypes}}}
-self.dtypes_to_json(dtypes=dtypes, cl='mysql', db_name=db_name, collection_name=tabel_name, path=path, is_file_dtype=is_file_dtype)
-else:
-print(f'数据库回传数据(name_type)为空')
-
-def load_dtypes(self, db_name, collection_name, path, cl='dataframe', ):
-if os.path.isfile(path):
-self.json_before(json_file=path) # 更新本地json信息到 self.datas
-elif os.path.isdir(path):
-json_file = os.path.join(path, 'mysql_types.json')
-if os.path.isfile(json_file):
-self.json_before(json_file=json_file)
-else:
-# 如果不存在,则新建文件
-with open(json_file, 'w', encoding='utf-8_sig') as f:
-json.dump(self.datas, f, ensure_ascii=False, sort_keys=True, indent=4)
-# print(f'不存在的文件: {json_file}')
-return
-
-if cl in self.datas.keys():
-if db_name in list(self.datas[cl].keys()):
-if collection_name in list(self.datas[cl][db_name].keys()):
-return self.datas[cl][db_name][collection_name]
-else:
-print(f'不存在的集合名信息: {collection_name}, 文件位置: {json_file}')
-return {}
-else:
-print(f'不存在的数据库信息: {db_name}, 文件位置: {json_file}')
-return {}
-else:
-print(f'不存在的数据分类: {cl}, 文件位置: {json_file}')
-return {}
-
-
-def mysql_all_dtypes(path=None):
-"""
-更新笔记本 mysql 中所有数据库的 dtypes 信息到本地 json
-"""
-if not path:
-path = set_support.SetSupport(dirname='support').dirname
-
-username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mysql')
-config = {
-'host': host,
-'port': port,
-'user': username,
-'password': password,
-'charset': 'utf8mb4', # utf8mb4 支持存储四字节的UTF-8字符集
-'cursorclass': pymysql.cursors.DictCursor,
-}
-
-connection = pymysql.connect(**config) # 连接数据库
-with connection.cursor() as cursor:
-sql = "SHOW DATABASES;"
-cursor.execute(sql)
-db_name_lists = cursor.fetchall()
-db_name_lists = [item['Database'] for item in db_name_lists]
-connection.close()
-
-sys_lists = ['information_schema', 'mysql', 'performance_schema', 'sakila', 'sys']
-db_name_lists = [item for item in db_name_lists if item not in sys_lists]
-
-# db_name_lists = [
-# '京东数据2',
-# '天猫数据2',
-# '市场数据2',
-# '生意参谋数据2',
-# '生意经2',
-# '属性设置2',
-# '聚合数据',
-# ]
-results = []
-for db_name in db_name_lists:
-config.update({'database': db_name}) # 添加更新 config 字段
-connection = pymysql.connect(**config) # 连接数据库
-try:
-with connection.cursor() as cursor:
-sql = f"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{db_name}';"
-sql = "SHOW TABLES;"
-cursor.execute(sql)
-table_name = cursor.fetchall()
-for table in table_name:
-for k, v in table.items():
-results.append({db_name: v})
-except:
-pass
-finally:
-connection.close()
-time.sleep(0.5)
-
-d = DataTypes()
-for result in results:
-for k, v in result.items():
-d.mysql_dtypes_to_json(db_name=k, tabel_name=v, path=path)
-
-
-if __name__ == '__main__':
-# mysql_all_dtypes() # 更新 mysql 中所有数据库的 dtypes 信息到本地 json
-d = DataTypes()
-
{mdbq-1.2.4.dist-info → mdbq-1.2.6.dist-info}/top_level.txt
File without changes
|