mdbq 3.3.14__py3-none-any.whl → 3.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/datashow.py +380 -46
- mdbq/aggregation/query_data.py +45 -15
- {mdbq-3.3.14.dist-info → mdbq-3.3.16.dist-info}/METADATA +1 -1
- {mdbq-3.3.14.dist-info → mdbq-3.3.16.dist-info}/RECORD +6 -6
- {mdbq-3.3.14.dist-info → mdbq-3.3.16.dist-info}/WHEEL +0 -0
- {mdbq-3.3.14.dist-info → mdbq-3.3.16.dist-info}/top_level.txt +0 -0
mdbq/aggregation/datashow.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# -*- coding: UTF-8 -*-
|
2
2
|
import os
|
3
|
+
import re
|
3
4
|
import socket
|
4
5
|
import platform
|
5
6
|
import datetime
|
@@ -118,7 +119,7 @@ class DataShow:
|
|
118
119
|
df = pd.concat(__res, ignore_index=True)
|
119
120
|
return df
|
120
121
|
|
121
|
-
def
|
122
|
+
def dpll_bak(self, db_name='聚合数据', table_name='店铺流量来源构成', pro_list=None, filename='店铺流量来源'):
|
122
123
|
if not pro_list:
|
123
124
|
pro_list = ['日期', '三级来源', '访客数']
|
124
125
|
df = self.getdata(db_name=db_name, table_name=table_name, pro_list=pro_list, start_date='2024-11-01', end_date=self.end_date)
|
@@ -131,7 +132,7 @@ class DataShow:
|
|
131
132
|
|
132
133
|
def st_date(num=1):
|
133
134
|
return pd.to_datetime(today - datetime.timedelta(days=num))
|
134
|
-
|
135
|
+
max_date = df['日期'].max().strftime('%Y-%m-%d')
|
135
136
|
df1 = df[df['日期'] >= st_date(1)]
|
136
137
|
df2 = df[df['日期'] >= st_date(7)]
|
137
138
|
df3 = df[df['日期'] >= st_date(30)]
|
@@ -184,7 +185,7 @@ class DataShow:
|
|
184
185
|
fig.add_annotation(
|
185
186
|
text=f'最近{pie_title[i]}天',
|
186
187
|
x=0.15 + 0.35 * (i - 1),
|
187
|
-
y=0.
|
188
|
+
y=0.98,
|
188
189
|
xref='paper', # # 相对于整个图表区域
|
189
190
|
yref='paper',
|
190
191
|
showarrow=True, # 显示箭头
|
@@ -193,7 +194,7 @@ class DataShow:
|
|
193
194
|
)
|
194
195
|
i += 1
|
195
196
|
fig.update_layout(
|
196
|
-
title_text='店铺流量来源',
|
197
|
+
title_text=f'店铺流量来源 最近一天: {max_date}',
|
197
198
|
xaxis_title='X Axis',
|
198
199
|
yaxis_title='Y Axis',
|
199
200
|
# width=self.screen_width // 1.4,
|
@@ -212,6 +213,140 @@ class DataShow:
|
|
212
213
|
fig = make_sub(data_list=data_list, num=3)
|
213
214
|
fig.write_html(os.path.join(self.path, f'{filename}.html'))
|
214
215
|
|
216
|
+
def dpll(self, db_name='聚合数据', table_name='店铺流量来源构成', pro_list=None, filename='店铺流量来源'):
|
217
|
+
if not pro_list:
|
218
|
+
pro_list = ['日期', '店铺名称', '类别', '来源构成', '二级来源', '三级来源', '访客数']
|
219
|
+
df = self.getdata(db_name=db_name, table_name=table_name, pro_list=pro_list, start_date='2024-11-01', end_date=self.end_date)
|
220
|
+
if len(df) == 0:
|
221
|
+
print(f'数据不能为空: {table_name}')
|
222
|
+
return
|
223
|
+
df['日期'] = pd.to_datetime(df['日期'])
|
224
|
+
df = df[
|
225
|
+
(df['店铺名称'] == '万里马官方旗舰店') &
|
226
|
+
(df['类别'] == '非全站推广期') &
|
227
|
+
(df['来源构成'] == '商品流量')
|
228
|
+
]
|
229
|
+
today = datetime.date.today()
|
230
|
+
|
231
|
+
def st_date(num=1):
|
232
|
+
return pd.to_datetime(today - datetime.timedelta(days=num))
|
233
|
+
max_date = df['日期'].max().strftime('%Y-%m-%d')
|
234
|
+
|
235
|
+
data_list = []
|
236
|
+
for days in [1, 7, 30]:
|
237
|
+
df_linshi = df[df['日期'] >= st_date(num=days)]
|
238
|
+
# 统计三级来源
|
239
|
+
df_linshi3 = df_linshi[df_linshi['二级来源'] != '汇总']
|
240
|
+
th_list = df_linshi3.groupby(['日期', '店铺名称', '类别', '来源构成', '二级来源']).size()
|
241
|
+
th_list = th_list.reset_index()
|
242
|
+
th_list = th_list[th_list[0] > 1]
|
243
|
+
th_list = th_list['二级来源'].tolist()
|
244
|
+
df_linshi3['三级来源'] = df_linshi3.apply(lambda x: x['三级来源'] if x['三级来源'] != '汇总' else '' if x['三级来源'] == '汇总' and x['二级来源'] in th_list else x['二级来源'], axis=1)
|
245
|
+
df_linshi3 = df_linshi3[df_linshi3['三级来源'] != '']
|
246
|
+
df_linshi3 = df_linshi3.groupby(['三级来源'], as_index=False).agg(**{'访客数': ('访客数', np.sum)})
|
247
|
+
|
248
|
+
df_linshi2 = df_linshi[(df_linshi['二级来源'] != '汇总') & (df_linshi['三级来源'] == '汇总')]
|
249
|
+
df_linshi2 = df_linshi2.groupby(['二级来源'], as_index=False).agg(**{'访客数': ('访客数', np.sum)})
|
250
|
+
data_list.append({'来源类型': '三级来源', '统计周期': days, '数据主体': df_linshi3})
|
251
|
+
data_list.append({'来源类型': '二级来源', '统计周期': days, '数据主体': df_linshi2})
|
252
|
+
# print(data_list)
|
253
|
+
t_p1 = []
|
254
|
+
for i in range(3):
|
255
|
+
t_p1.extend([{"type": "pie"}]) # 饼图类型
|
256
|
+
t_p2 = []
|
257
|
+
for i in range(3):
|
258
|
+
t_p2.extend([{"type": "pie"}]) # 饼图类型
|
259
|
+
specs = [t_p1, t_p2]
|
260
|
+
fig = make_subplots(rows=2, cols=3, specs=specs)
|
261
|
+
|
262
|
+
count1 = 0
|
263
|
+
count2 = 0
|
264
|
+
for item in data_list:
|
265
|
+
labels = item['数据主体'][item['来源类型']].tolist()
|
266
|
+
values = item['数据主体']['访客数'].tolist()
|
267
|
+
# 计算每个扇区的百分比,并找出哪些扇区应该被保留
|
268
|
+
total = sum(values)
|
269
|
+
# 计算每个扇区的百分比,并找出哪些扇区应该被保留
|
270
|
+
threshold_percentage = 1 # 阈值百分比
|
271
|
+
filtered_indices = [i for i, value in enumerate(values) if
|
272
|
+
(value / total) * 100 >= threshold_percentage]
|
273
|
+
# 提取被保留的扇区的标签和值
|
274
|
+
filtered_labels = [labels[i] for i in filtered_indices]
|
275
|
+
filtered_values = [values[i] for i in filtered_indices]
|
276
|
+
if item['来源类型'] == '二级来源':
|
277
|
+
# 添加饼图
|
278
|
+
fig.add_trace(
|
279
|
+
go.Pie(
|
280
|
+
labels=filtered_labels,
|
281
|
+
values=filtered_values,
|
282
|
+
name=item['来源类型'],
|
283
|
+
textinfo='label+percent'
|
284
|
+
),
|
285
|
+
row=1,
|
286
|
+
col=count1+1,
|
287
|
+
)
|
288
|
+
x = 0.14 + 0.355 * (count1)
|
289
|
+
y = 0.98
|
290
|
+
fig.add_annotation(
|
291
|
+
text=f'{item['来源类型']} 最近{item['统计周期']}天',
|
292
|
+
x=x,
|
293
|
+
y=y,
|
294
|
+
xref='paper', # # 相对于整个图表区域
|
295
|
+
yref='paper',
|
296
|
+
showarrow=True, # 显示箭头
|
297
|
+
align="left", # 文本对齐方式
|
298
|
+
font=dict(size=14),
|
299
|
+
)
|
300
|
+
count1 += 1
|
301
|
+
else:
|
302
|
+
# 添加饼图
|
303
|
+
fig.add_trace(
|
304
|
+
go.Pie(
|
305
|
+
labels=filtered_labels,
|
306
|
+
values=filtered_values,
|
307
|
+
name=item['来源类型'],
|
308
|
+
textinfo='label+percent'
|
309
|
+
),
|
310
|
+
row=2,
|
311
|
+
col=count2+1,
|
312
|
+
)
|
313
|
+
x = 0.12 + 0.39 * (count2 % 3)
|
314
|
+
y = -0.12
|
315
|
+
fig.add_annotation(
|
316
|
+
text=f'{item['来源类型']} 最近{item['统计周期']}天',
|
317
|
+
x=x,
|
318
|
+
y=y,
|
319
|
+
xref='paper', # # 相对于整个图表区域
|
320
|
+
yref='paper',
|
321
|
+
showarrow=False, # 显示箭头
|
322
|
+
align="left", # 文本对齐方式
|
323
|
+
font=dict(size=14),
|
324
|
+
)
|
325
|
+
count2 += 1
|
326
|
+
fig.update_layout(
|
327
|
+
title_text=f'店铺流量来源 最近数据: {max_date}',
|
328
|
+
# xaxis_title='X Axis',
|
329
|
+
# yaxis_title='Y Axis',
|
330
|
+
# width=self.screen_width // 1.4,
|
331
|
+
# height=self.screen_width // 2,
|
332
|
+
margin=dict(
|
333
|
+
l=100, # 左边距
|
334
|
+
r=100,
|
335
|
+
t=100, # 上边距
|
336
|
+
b=100,
|
337
|
+
),
|
338
|
+
legend=dict(
|
339
|
+
# title='Legend Title', # 图例标题
|
340
|
+
orientation='v', # 图例方向('h' 表示水平,'v' 表示垂直)
|
341
|
+
# x=0.5, # 图例在图表中的 x 位置(0 到 1 的比例)
|
342
|
+
# y=1.02, # 图例在图表中的 y 位置(稍微超出顶部以避免遮挡数据)
|
343
|
+
font=dict(
|
344
|
+
size=12 # 图例字体大小
|
345
|
+
)
|
346
|
+
)
|
347
|
+
)
|
348
|
+
fig.write_html(os.path.join(self.path, f'{filename}.html'))
|
349
|
+
|
215
350
|
def tg(self, db_name='聚合数据', table_name='多店推广场景_按日聚合', pro_list=None, filename='多店推广场景', days=None, start_date=None, end_date=None):
|
216
351
|
"""
|
217
352
|
:param db_name:
|
@@ -243,6 +378,8 @@ class DataShow:
|
|
243
378
|
df = df[df['日期'] >= st_date(num=7)]
|
244
379
|
|
245
380
|
df = df.groupby(['日期', '店铺名称', '营销场景'], as_index=False).agg(**{'花费': ('花费', np.sum), '成交金额': ('成交金额', np.sum)})
|
381
|
+
max_date = df['日期'].max().strftime('%Y-%m-%d')
|
382
|
+
min_date = df['日期'].min().strftime('%Y-%m-%d')
|
246
383
|
df_other = df.groupby(['店铺名称'], as_index=False).agg(**{'花费': ('花费', np.sum)})
|
247
384
|
df_other = df_other.sort_values('花费', ascending=False)
|
248
385
|
data_list = []
|
@@ -300,8 +437,18 @@ class DataShow:
|
|
300
437
|
return fig
|
301
438
|
|
302
439
|
fig = make_sub(data_list=data_list)
|
440
|
+
fig.add_annotation(
|
441
|
+
text=f'统计范围: {min_date} ~ {max_date}',
|
442
|
+
x=0.5,
|
443
|
+
y=-0.15,
|
444
|
+
xref='paper', # # 相对于整个图表区域
|
445
|
+
yref='paper',
|
446
|
+
showarrow=False, # 显示箭头
|
447
|
+
align="left", # 文本对齐方式
|
448
|
+
font=dict(size=14),
|
449
|
+
)
|
303
450
|
fig.update_layout(
|
304
|
-
title_text='多店推广花费_按日聚合',
|
451
|
+
title_text=f'多店推广花费_按日聚合',
|
305
452
|
xaxis_title='日期',
|
306
453
|
yaxis_title='花费',
|
307
454
|
# width=self.screen_width // 1.4,
|
@@ -330,64 +477,97 @@ class DataShow:
|
|
330
477
|
count += 1
|
331
478
|
fig.write_html(os.path.join(self.path, f'{filename}.html'))
|
332
479
|
|
333
|
-
def item_crowd(self, db_name='商品人群画像2', table_list=None, pro_list=None, filename='商品人群画像', item_id=None):
|
480
|
+
def item_crowd(self, db_name='商品人群画像2', table_list=None, pro_list=None, filename='商品人群画像', item_id=None, lab='全部渠道', option='商详浏览', d_str='近30天', last_date=None):
|
481
|
+
# item_ids = [696017020186, 714066010148, 830890472575]
|
334
482
|
if not pro_list:
|
335
483
|
pro_list = ['日期', '店铺名称', '洞察类型', '行为类型', '商品id', '统计周期', '标签名称', '标签人群数量']
|
336
484
|
if not table_list:
|
337
|
-
table_list = [
|
485
|
+
table_list = [
|
486
|
+
'消费能力等级',
|
487
|
+
'用户年龄',
|
488
|
+
'月均消费金额',
|
489
|
+
'大快消策略人群',
|
490
|
+
'店铺潜新老客',
|
491
|
+
'城市等级',
|
492
|
+
'用户职业',
|
493
|
+
]
|
494
|
+
if not item_id:
|
495
|
+
item_id = 696017020186
|
496
|
+
dict_list = {}
|
338
497
|
for table_name in table_list:
|
339
498
|
df = self.getdata(db_name=db_name, table_name=table_name, pro_list=pro_list)
|
340
|
-
|
341
|
-
|
342
|
-
|
499
|
+
if len(df) == 0:
|
500
|
+
print(f'{table_name}: 数据长度不能为 0')
|
501
|
+
continue
|
502
|
+
df['日期'] = pd.to_datetime(df['日期'])
|
503
|
+
|
504
|
+
df['商品id'] = df['商品id'].astype('int64')
|
343
505
|
df = df[df['商品id'] == int(item_id)]
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
506
|
+
# 对数据进行筛选
|
507
|
+
df = df[
|
508
|
+
~df['标签名称'].str.contains('unknown', case=False) &
|
509
|
+
(df['洞察类型'] == lab) &
|
510
|
+
(df['行为类型'] == option) &
|
511
|
+
(df['统计周期'] == d_str)
|
512
|
+
]
|
513
|
+
dict_list.update({table_name: df})
|
350
514
|
|
351
515
|
fig = make_subplots(rows=2, cols=3)
|
352
516
|
# 在每个子图中绘制柱形图
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
517
|
+
count = 0
|
518
|
+
sv_date = {}
|
519
|
+
for table_name, df in dict_list.items():
|
520
|
+
if len(df) == 0:
|
521
|
+
count += 1
|
522
|
+
continue
|
523
|
+
# print(count, table_name)
|
524
|
+
if count > 5:
|
525
|
+
break
|
526
|
+
last_date = df['日期'].max()
|
527
|
+
sv_date.update({table_name: last_date.strftime('%Y-%m-%d')})
|
528
|
+
df = df[df['日期'] == last_date]
|
529
|
+
# 先进行排序,以便柱形图从高到低
|
530
|
+
df.sort_values(['标签人群数量'], ascending=[False], ignore_index=True, inplace=True)
|
531
|
+
labels = df['标签名称'].tolist() # 由于上面有自定义排序,labels 和 values 要放在一起
|
532
|
+
values = df['标签人群数量'].tolist()
|
533
|
+
df['Percentage'] = df['标签人群数量'] / df['标签人群数量'].sum() * 100
|
534
|
+
percentages = df['Percentage']
|
359
535
|
bar = go.Bar(
|
360
536
|
x=labels,
|
361
537
|
y=values,
|
362
|
-
name=
|
538
|
+
name=table_name,
|
539
|
+
orientation='v', # 垂直柱形图
|
363
540
|
text=percentages.map('{:.2f}%'.format), # 设置要显示的文本(百分比)
|
364
541
|
# textposition = 'outside', # 设置文本位置在柱形图外部
|
542
|
+
width=0.55 # 调整柱子最大宽度
|
365
543
|
)
|
544
|
+
row = count // 3 + 1
|
545
|
+
col = count % 3 + 1
|
366
546
|
fig.add_trace(
|
367
547
|
bar,
|
368
|
-
row=
|
369
|
-
col=
|
548
|
+
row=row,
|
549
|
+
col=col,
|
370
550
|
)
|
371
551
|
if count < 3:
|
372
|
-
x = 0.01 + 0.
|
552
|
+
x = 0.01 + 0.385 * (count)
|
373
553
|
y = 1.04
|
374
554
|
else:
|
375
|
-
x = 0.01 + 0.
|
555
|
+
x = 0.01 + 0.385 * (count % 3)
|
376
556
|
y = 1.04 - 0.59 * (count // 3)
|
377
557
|
fig.add_annotation(
|
378
|
-
text=
|
558
|
+
text=f'{table_name}',
|
379
559
|
x=x,
|
380
560
|
y=y,
|
381
561
|
xref='paper', # # 相对于整个图表区域
|
382
562
|
yref='paper',
|
383
563
|
showarrow=False, # 显示箭头
|
384
564
|
align="left", # 文本对齐方式
|
385
|
-
font=dict(size=
|
565
|
+
font=dict(size=15),
|
386
566
|
)
|
387
|
-
|
388
|
-
|
567
|
+
count += 1
|
568
|
+
|
389
569
|
fig.update_layout(
|
390
|
-
title_text=db_name,
|
570
|
+
title_text=f'{db_name} 商品id: {item_id}',
|
391
571
|
xaxis_title='标签',
|
392
572
|
yaxis_title='人群数量',
|
393
573
|
# width=self.screen_width // 1.4,
|
@@ -396,36 +576,190 @@ class DataShow:
|
|
396
576
|
l=100, # 左边距
|
397
577
|
r=100,
|
398
578
|
t=100, # 上边距
|
399
|
-
b=
|
579
|
+
b=100,
|
400
580
|
),
|
401
581
|
# legend=dict(orientation="h")
|
402
582
|
)
|
403
|
-
|
404
|
-
|
583
|
+
fig.add_annotation(
|
584
|
+
text=f'统计范围: {lab}/{option} {d_str}',
|
585
|
+
x=0.5,
|
586
|
+
y=-0.1,
|
587
|
+
xref='paper', # # 相对于整个图表区域
|
588
|
+
yref='paper',
|
589
|
+
showarrow=False, # 显示箭头
|
590
|
+
align="left", # 文本对齐方式
|
591
|
+
font=dict(size=14),
|
592
|
+
)
|
593
|
+
fig.add_annotation(
|
594
|
+
text=re.sub('[{}\',]', '', str(sv_date)),
|
595
|
+
x=0.5,
|
596
|
+
y=-0.135,
|
597
|
+
xref='paper', # # 相对于整个图表区域
|
598
|
+
yref='paper',
|
599
|
+
showarrow=False, # 显示箭头
|
600
|
+
align="left", # 文本对齐方式
|
601
|
+
font=dict(size=12),
|
602
|
+
)
|
603
|
+
fig.write_html(os.path.join(self.path, f'{filename}.html'))
|
604
|
+
|
605
|
+
def crowd(self, db_name='人群画像2', table_list=None, pro_list=None, filename='达摩盘人群画像', crowd_id=None, last_date=None):
|
606
|
+
# item_ids = [696017020186, 714066010148, 830890472575]
|
607
|
+
if not pro_list:
|
608
|
+
pro_list = ['日期', '店铺名称', '人群id', '人群名称', '标签名称', '标签人群数量']
|
609
|
+
if not table_list:
|
610
|
+
table_list = [
|
611
|
+
'消费能力等级',
|
612
|
+
'用户年龄',
|
613
|
+
'月均消费金额',
|
614
|
+
'大快消策略人群',
|
615
|
+
'店铺潜新老客',
|
616
|
+
'城市等级',
|
617
|
+
'用户职业',
|
618
|
+
]
|
619
|
+
if not crowd_id:
|
620
|
+
crowd_id = 40457369
|
621
|
+
|
622
|
+
dict_list = {}
|
623
|
+
for table_name in table_list:
|
624
|
+
df = self.getdata(db_name=db_name, table_name=table_name, pro_list=pro_list)
|
625
|
+
if len(df) == 0:
|
626
|
+
print(f'{table_name}: 数据长度不能为 0')
|
627
|
+
continue
|
628
|
+
df['日期'] = pd.to_datetime(df['日期'])
|
629
|
+
|
630
|
+
df['人群id'] = df['人群id'].astype('int64')
|
631
|
+
df = df[df['人群id'] == int(crowd_id)]
|
632
|
+
# 对数据进行筛选
|
633
|
+
df = df[
|
634
|
+
(df['店铺名称'] == '万里马官方旗舰店')
|
635
|
+
# ~df['标签名称'].str.contains('unknown', case=False)
|
636
|
+
]
|
637
|
+
dict_list.update({table_name: df})
|
638
|
+
crowd_name = df.head(1)['人群名称'].tolist()[0] # 随便取一条数据读取人群名称
|
639
|
+
fig = make_subplots(rows=2, cols=3)
|
640
|
+
# 在每个子图中绘制柱形图
|
641
|
+
count = 0
|
642
|
+
sv_date = {}
|
643
|
+
unknown_dict = {}
|
644
|
+
for table_name, df in dict_list.items():
|
645
|
+
if len(df) == 0:
|
646
|
+
count += 1
|
647
|
+
continue
|
648
|
+
# print(count, table_name)
|
649
|
+
if count > 5:
|
650
|
+
break
|
651
|
+
last_date = df['日期'].max()
|
652
|
+
df = df[df['日期'] == last_date]
|
653
|
+
unknown = df[df['标签名称'].str.contains('unknown', case=False)]
|
654
|
+
if len(unknown) > 0:
|
655
|
+
unknown = unknown['标签人群数量'].tolist()[0] # 未知人群数量值
|
656
|
+
|
657
|
+
df = df[~df['标签名称'].str.contains('unknown', case=False)]
|
658
|
+
# 先进行排序,以便柱形图从高到低
|
659
|
+
df.sort_values(['标签人群数量'], ascending=[False], ignore_index=True, inplace=True)
|
660
|
+
labels = df['标签名称'].tolist() # 由于上面有自定义排序,labels 和 values 要放在一起
|
661
|
+
values = df['标签人群数量'].tolist()
|
662
|
+
crowd_sum = df['标签人群数量'].values.sum()
|
663
|
+
sv_date.update({table_name: crowd_sum})
|
664
|
+
unknown_dict.update({table_name: unknown})
|
665
|
+
df['Percentage'] = df['标签人群数量'] / df['标签人群数量'].sum() * 100
|
666
|
+
percentages = df['Percentage']
|
667
|
+
bar = go.Bar(
|
668
|
+
x=labels,
|
669
|
+
y=values,
|
670
|
+
name=table_name,
|
671
|
+
orientation='v', # 垂直柱形图
|
672
|
+
text=percentages.map('{:.2f}%'.format), # 设置要显示的文本(百分比)
|
673
|
+
# textposition = 'outside', # 设置文本位置在柱形图外部
|
674
|
+
width=0.55 # 调整柱子最大宽度
|
675
|
+
)
|
676
|
+
row = count // 3 + 1
|
677
|
+
col = count % 3 + 1
|
678
|
+
fig.add_trace(
|
679
|
+
bar,
|
680
|
+
row=row,
|
681
|
+
col=col,
|
682
|
+
)
|
683
|
+
if count < 3:
|
684
|
+
x = 0.01 + 0.42 * (count)
|
685
|
+
y = 1.04
|
686
|
+
else:
|
687
|
+
x = 0.01 + 0.42 * (count % 3)
|
688
|
+
y = 1.04 - 0.59 * (count // 3)
|
405
689
|
fig.add_annotation(
|
406
|
-
text=f'
|
407
|
-
x=
|
408
|
-
y=
|
690
|
+
text=f'{table_name} 人群数量: {crowd_sum}',
|
691
|
+
x=x,
|
692
|
+
y=y,
|
409
693
|
xref='paper', # # 相对于整个图表区域
|
410
694
|
yref='paper',
|
411
695
|
showarrow=False, # 显示箭头
|
412
696
|
align="left", # 文本对齐方式
|
413
|
-
font=dict(size=
|
697
|
+
font=dict(size=15),
|
414
698
|
)
|
415
699
|
count += 1
|
700
|
+
|
701
|
+
fig.update_layout(
|
702
|
+
title_text=f'达摩盘人群画像 人群id: {crowd_id} / 人群名字: 【{crowd_name}】',
|
703
|
+
xaxis_title='标签',
|
704
|
+
yaxis_title='人群数量',
|
705
|
+
# width=self.screen_width // 1.4,
|
706
|
+
# height=self.screen_width // 2,
|
707
|
+
margin=dict(
|
708
|
+
l=100, # 左边距
|
709
|
+
r=100,
|
710
|
+
t=100, # 上边距
|
711
|
+
b=100,
|
712
|
+
),
|
713
|
+
# legend=dict(orientation="h")
|
714
|
+
)
|
715
|
+
res = {}
|
716
|
+
for k, v in sv_date.items():
|
717
|
+
res.update({k: int(v)})
|
718
|
+
unknown_res = {}
|
719
|
+
for k, v in unknown_dict.items():
|
720
|
+
unknown_res.update({k: int(v)})
|
721
|
+
|
722
|
+
fig.add_annotation(
|
723
|
+
text=f'分析人群数量: {re.sub('[{}\',]', '', str(res))}',
|
724
|
+
x=0.5,
|
725
|
+
y=-0.1,
|
726
|
+
xref='paper', # # 相对于整个图表区域
|
727
|
+
yref='paper',
|
728
|
+
showarrow=False, # 显示箭头
|
729
|
+
align="left", # 文本对齐方式
|
730
|
+
font=dict(size=12),
|
731
|
+
)
|
732
|
+
fig.add_annotation(
|
733
|
+
text=f'与官方统计存在差异,官方计算中包含未知人群,数量为: {re.sub('[{}\',]', '', str(unknown_res))},未知人群占比越大,同官方差异越大',
|
734
|
+
x=0.5,
|
735
|
+
y=-0.135,
|
736
|
+
xref='paper', # # 相对于整个图表区域
|
737
|
+
yref='paper',
|
738
|
+
showarrow=False, # 显示箭头
|
739
|
+
align="left", # 文本对齐方式
|
740
|
+
font=dict(size=12),
|
741
|
+
)
|
416
742
|
fig.write_html(os.path.join(self.path, f'{filename}.html'))
|
417
743
|
|
418
744
|
|
419
745
|
def main():
|
420
746
|
ds = DataShow()
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
747
|
+
ds.dpll()
|
748
|
+
ds.tg(
|
749
|
+
days=15,
|
750
|
+
# start_date='2024-11-01',
|
751
|
+
# end_date='2024-11-30',
|
752
|
+
)
|
427
753
|
ds.item_crowd(
|
428
|
-
|
754
|
+
item_id=839148235697,
|
755
|
+
lab='全部渠道',
|
756
|
+
option='商详浏览',
|
757
|
+
last_date=None,
|
758
|
+
d_str='近30天',
|
759
|
+
)
|
760
|
+
ds.crowd(
|
761
|
+
crowd_id=40457166,
|
762
|
+
last_date=None,
|
429
763
|
)
|
430
764
|
|
431
765
|
|
mdbq/aggregation/query_data.py
CHANGED
@@ -482,19 +482,37 @@ class MysqlDatasQuery:
|
|
482
482
|
}
|
483
483
|
)
|
484
484
|
df.insert(loc=1, column='推广渠道', value='万相台无界版') # df中插入新列
|
485
|
+
|
486
|
+
# 开始处理用户特征
|
487
|
+
df_sx = self.download.data_to_df(
|
488
|
+
db_name='达摩盘3',
|
489
|
+
table_name=f'我的人群属性',
|
490
|
+
start_date=start_date,
|
491
|
+
end_date=end_date,
|
492
|
+
projection={'人群名称': 1, '消费能力等级': 1, '用户年龄': 1},
|
493
|
+
)
|
494
|
+
df_sx['人群名称'] = df_sx['人群名称'].apply(lambda x: f'达摩盘:{x}')
|
495
|
+
df_sx.rename(columns={'消费能力等级': '消费力层级'}, inplace=True)
|
496
|
+
df = pd.merge(df, df_sx, left_on=['人群名字'], right_on=['人群名称'], how='left')
|
497
|
+
df.pop('人群名称')
|
498
|
+
df['消费力层级'] = df['消费力层级'].apply(lambda x: f'L{"".join(re.findall(r'L(\d)', str(x)))}' if str(x) != 'nan' else x)
|
499
|
+
df['用户年龄'] = df['用户年龄'].apply(lambda x: "~".join(re.findall(r'(\d{2})\D.*(\d{2})岁', str(x))[0]) if str(x) != 'nan' else x)
|
500
|
+
|
485
501
|
# 1. 匹配 L后面接 2 个或以上数字,不区分大小写,示例:L345
|
486
502
|
# 2. 其余情况,L 后面接多个数字的都会被第一条 if 命中,不区分大小写
|
503
|
+
|
487
504
|
df['消费力层级'] = df.apply(
|
488
505
|
lambda x:
|
489
506
|
''.join(re.findall(r'(l\d+)', x['人群名字'].upper(), re.IGNORECASE)) if re.findall(r'(l\d{2,})',
|
490
507
|
x['人群名字'],
|
491
|
-
re.IGNORECASE)
|
492
|
-
else 'L5' if re.findall(r'(l\d*5)', x['人群名字'], re.IGNORECASE)
|
493
|
-
else 'L4' if re.findall(r'(l\d*4)', x['人群名字'], re.IGNORECASE)
|
494
|
-
else 'L3' if re.findall(r'(l\d*3)', x['人群名字'], re.IGNORECASE)
|
495
|
-
else 'L2' if re.findall(r'(l\d*2)', x['人群名字'], re.IGNORECASE)
|
496
|
-
else 'L1' if re.findall(r'(l\d*1)', x['人群名字'], re.IGNORECASE)
|
497
|
-
else '', axis=1)
|
508
|
+
re.IGNORECASE) and str(x['消费力层级']) == 'nan'
|
509
|
+
else 'L5' if re.findall(r'(l\d*5)', x['人群名字'], re.IGNORECASE) and str(x['消费力层级']) == 'nan'
|
510
|
+
else 'L4' if re.findall(r'(l\d*4)', x['人群名字'], re.IGNORECASE) and str(x['消费力层级']) == 'nan'
|
511
|
+
else 'L3' if re.findall(r'(l\d*3)', x['人群名字'], re.IGNORECASE) and str(x['消费力层级']) == 'nan'
|
512
|
+
else 'L2' if re.findall(r'(l\d*2)', x['人群名字'], re.IGNORECASE) and str(x['消费力层级']) == 'nan'
|
513
|
+
else 'L1' if re.findall(r'(l\d*1)', x['人群名字'], re.IGNORECASE) and str(x['消费力层级']) == 'nan'
|
514
|
+
else x['消费力层级'], axis=1)
|
515
|
+
|
498
516
|
# 1. 匹配连续的 4 个数字且后面不能接数字或"元"或汉字,筛掉的人群示例:月均消费6000元|受众20240729175213|xxx2024真皮公文包
|
499
517
|
# 2. 匹配 2数字_2数字且前面不能是数字,合法匹配:人群_30_50_促; 非法示例:L345_3040 避免识别出 35~20 岁用户的情况
|
500
518
|
# pattern = r'(\d{4})(?!\d|[\u4e00-\u9fa5])' # 匹配 4 个数字,后面不能接数字或汉字
|
@@ -506,21 +524,29 @@ class MysqlDatasQuery:
|
|
506
524
|
pattern2 = r'(?<![\dlL])(\d{2}_\d{2})'
|
507
525
|
df['用户年龄'] = df.apply(
|
508
526
|
lambda x:
|
509
|
-
''.join(re.findall(pattern1, x['人群名字'].upper())) if re.findall(pattern1, x['人群名字'])
|
527
|
+
''.join(re.findall(pattern1, x['人群名字'].upper())) if re.findall(pattern1, x['人群名字']) and str(x['用户年龄']) == 'nan'
|
510
528
|
# else ''.join(re.findall(r'[^\d|l|L](\d{2}_\d{2})', x['人群名字'].upper())) if re.findall(r'[^\d|l|L](\d{2}_\d{2})', x['人群名字'])
|
511
|
-
else ''.join(re.findall(pattern2, x['人群名字'].upper())) if re.findall(pattern2, x['人群名字'])
|
529
|
+
else ''.join(re.findall(pattern2, x['人群名字'].upper())) if re.findall(pattern2, x['人群名字']) and str(x['用户年龄']) == 'nan'
|
512
530
|
else ''.join(re.findall(r'(\d{2}-\d{2})岁', x['人群名字'].upper())) if re.findall(r'(\d{2}-\d{2})岁',
|
513
|
-
x['人群名字'])
|
514
|
-
else '', axis=1)
|
531
|
+
x['人群名字']) and str(x['用户年龄']) == 'nan'
|
532
|
+
else x['用户年龄'], axis=1)
|
515
533
|
df['用户年龄'] = df['用户年龄'].apply(
|
516
534
|
lambda x: f'{x[:2]}~{x[2:4]}' if str(x).isdigit()
|
517
|
-
else str(x).replace('_', '~') if '_' in x
|
518
|
-
else str(x).replace('-', '~') if '-' in x
|
535
|
+
else str(x).replace('_', '~') if '_' in str(x)
|
536
|
+
else str(x).replace('-', '~') if '-' in str(x)
|
519
537
|
else x
|
520
538
|
)
|
521
539
|
# 年龄层不能是 0 开头
|
522
540
|
df['用户年龄'] = df['用户年龄'].apply(
|
523
541
|
lambda x: '' if str(x).startswith('0') else x)
|
542
|
+
df['用户年龄'] = df['用户年龄'].apply(
|
543
|
+
lambda x:
|
544
|
+
re.sub(f'~50', '~49' ,str(x)) if '~50' in str(x) else
|
545
|
+
re.sub(f'~40', '~39', str(x)) if '~40' in str(x) else
|
546
|
+
re.sub(f'~30', '~29' ,str(x)) if '~30' in str(x) else
|
547
|
+
re.sub(r'\d{4}~', '', str(x)) if str(x) != 'nan' else
|
548
|
+
x
|
549
|
+
)
|
524
550
|
# df = df.head(1000)
|
525
551
|
# df.to_csv('/Users/xigua/Downloads/test.csv', index=False, header=True, encoding='utf-8_sig')
|
526
552
|
# breakpoint()
|
@@ -3809,6 +3835,10 @@ if __name__ == '__main__':
|
|
3809
3835
|
# query3(months=2, less_dict=[])
|
3810
3836
|
|
3811
3837
|
sdq = MysqlDatasQuery() # 实例化数据处理类
|
3812
|
-
sdq.months =
|
3838
|
+
sdq.months = 1 # 设置数据周期, 1 表示近 2 个月
|
3813
3839
|
sdq.update_service = True # 调试时加,true: 将数据写入 mysql 服务器
|
3814
|
-
sdq.
|
3840
|
+
sdq.tg_rqbb(db_name='聚合数据', table_name='天猫_人群报表')
|
3841
|
+
|
3842
|
+
# string = '30-34岁,35-39岁,40-49岁'
|
3843
|
+
# d = "~".join(re.findall(r'(\d+)\D.*\D(\d+)岁', string)[0])
|
3844
|
+
# print(d)
|
@@ -2,9 +2,9 @@ mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/aggregation.py,sha256=-yzApnlqSN2L0E1YMu5ml-W827qpKQvWPCOI7jj2kzY,80264
|
5
|
-
mdbq/aggregation/datashow.py,sha256=
|
5
|
+
mdbq/aggregation/datashow.py,sha256=2NzHGjGoUy2WG-MxmbilCj6KBAmVah3jqFuEd2zv9XU,32379
|
6
6
|
mdbq/aggregation/optimize_data.py,sha256=RXIv7cACCgYyehAxMjUYi_S7rVyjIwXKWMaM3nduGtA,3068
|
7
|
-
mdbq/aggregation/query_data.py,sha256=
|
7
|
+
mdbq/aggregation/query_data.py,sha256=FcwaYUom2UGqCRsuGgwfuVdnY86PUOzkCivyoCY2oVQ,175663
|
8
8
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
9
9
|
mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
|
10
10
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -34,7 +34,7 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
|
34
34
|
mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
|
35
35
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
36
36
|
mdbq/spider/aikucun.py,sha256=eAIITxnbbxsR_EoohJ78CRw2dEdfSHOltfpxBrh0cvc,22207
|
37
|
-
mdbq-3.3.
|
38
|
-
mdbq-3.3.
|
39
|
-
mdbq-3.3.
|
40
|
-
mdbq-3.3.
|
37
|
+
mdbq-3.3.16.dist-info/METADATA,sha256=c2t76yzpaP9kkwDg5y3Ooam9oYe6p4ntlKjWFUjZ464,244
|
38
|
+
mdbq-3.3.16.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
39
|
+
mdbq-3.3.16.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
40
|
+
mdbq-3.3.16.dist-info/RECORD,,
|
File without changes
|
File without changes
|